### Imports

In [1]:
import pandas as pd
import spacy

In [2]:
# Load the spaCy pipeline named "en_core_web_sm". `nlp` is called by clean_text
# to tokenize each review and to read the tokens' `is_alpha` / `is_stop` flags.
nlp = spacy.load("en_core_web_sm")

### Data Cleaning

#### Read from CSV

In [3]:
# Read the review dataset from CSV (path is relative to the working directory).
df = pd.read_csv("../data/review_2022.csv")
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,text,label
0,I bought a Fender 1966 Telecaster that the sal...,negative
1,This is our go to for take out when I visit my...,positive
2,Danielle did a great job! She listened and cu...,positive
3,We saw a lot of roaches in the bathroom when w...,negative
4,We Ordered pork fried rice and beef chow mei ...,negative


#### Clean Data

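Create 2 sets of data for analysis and further preprocessing. In both sets the text is lower-cased and non-alphabetic tokens (numbers, punctuation) are dropped:
1. lower case, with stop words removed
2. lower case, with stop words kept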

Create a function to clean text.

In [4]:
def clean_text(text, stop_words=False):
  """Lower-case `text`, tokenize it with `nlp`, and re-join the tokens that are kept.

  A token is kept only if its `is_alpha` flag is set. When `stop_words` is
  False, tokens whose `is_stop` flag is set are dropped as well.

  Args:
    text: String to clean; it is lower-cased before being passed to `nlp`.
    stop_words: If True, stop words are KEPT in the output. If False (the
      default), stop words are removed.

  Returns:
    The kept tokens, in their original order, joined by single spaces.
  """
  doc = nlp(text.lower())
  kept = [
    str(tok)
    for tok in doc
    # `stop_words or ...` short-circuits, so `is_stop` is only consulted
    # when stop words are supposed to be filtered out.
    if tok.is_alpha and (stop_words or not tok.is_stop)
  ]
  return " ".join(kept)

##### Lower Case (Stop Words Removed)

In [5]:
# Variant 1: clean_text with its default stop_words=False, so stop words are dropped.
# `assign` returns a new frame with the "text" column replaced; `df` is left untouched.
df_clean = df.assign(text=df["text"].apply(clean_text))
df_clean.head()

Unnamed: 0,text,label
0,bought fender telecaster salesperson told orig...,negative
1,visit son restaurant desert home food authenti...,positive
2,danielle great job listened cut hair way reque...,positive
3,saw lot roaches bathroom woke bed large dark s...,negative
4,ordered pork fried rice beef chow mei fun teri...,negative


##### Lower Case + Stop Words (Stop Words Kept)

In [6]:
# Variant 2: stop_words=True is forwarded by Series.apply to clean_text, which then
# keeps stop words. `assign` returns a new frame; `df` is left untouched.
df_clean_sw = df.assign(text=df["text"].apply(clean_text, stop_words=True))
df_clean_sw.head()

Unnamed: 0,text,label
0,i bought a fender telecaster that the salesper...,negative
1,this is our go to for take out when i visit my...,positive
2,danielle did a great job she listened and cut ...,positive
3,we saw a lot of roaches in the bathroom when w...,negative
4,we ordered pork fried rice and beef chow mei f...,negative


#### Export to CSV

In [7]:
# Save the variant with stop words removed; index=False leaves the row index out of the file.
df_clean.to_csv("../data/review_2022_clean.csv", index=False)

In [8]:
# Save the variant with stop words kept; index=False leaves the row index out of the file.
df_clean_sw.to_csv("../data/review_2022_clean_sw.csv", index=False)