### Imports

In [1]:
import os
import pickle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Data Preparation

#### Read from CSV

In [2]:
# Load the cleaned review corpus. Judging by the preview below, this version
# of the text has stop words removed.
df = pd.read_csv("../data/review_2022_clean.csv")
# Bare last expression so the notebook renders a short preview of the frame.
df.head()

Unnamed: 0,text,label
0,bought fender telecaster salesperson told orig...,negative
1,visit son restaurant desert home food authenti...,positive
2,danielle great job listened cut hair way reque...,positive
3,saw lot roaches bathroom woke bed large dark s...,negative
4,ordered pork fried rice beef chow mei fun teri...,negative


In [3]:
# Load the "_sw" variant of the corpus. Judging by the preview below, this
# version of the text still contains stop words.
df_sw = pd.read_csv("../data/review_2022_clean_sw.csv")
# Bare last expression so the notebook renders a short preview of the frame.
df_sw.head()

Unnamed: 0,text,label
0,i bought a fender telecaster that the salesper...,negative
1,this is our go to for take out when i visit my...,positive
2,danielle did a great job she listened and cut ...,positive
3,we saw a lot of roaches in the bathroom when w...,negative
4,we ordered pork fried rice and beef chow mei f...,negative


#### Perform Train-Test Split

Create a function to perform the train-test split on either version of the corpus.

In [4]:
def split(stop_words=False):
  """Split one of the two review frames into train and test portions.

  Args:
    stop_words: When True, split `df_sw`; otherwise split `df`.

  Returns:
    The result of `train_test_split` on the frame's "text" and "label"
    columns, unpacked by callers as X_train, X_test, y_train, y_test.
  """
  frame = df_sw if stop_words else df
  # test_size=0.2 holds out 20% of rows; the fixed random_state makes the
  # shuffle repeatable across calls.
  return train_test_split(
    frame["text"],
    frame["label"],
    test_size=0.2,
    random_state=42,
  )

In [5]:
# Split the `df` corpus (the default, spelled out for symmetry with the _sw call).
X_train, X_test, y_train, y_test = split(stop_words=False)

In [6]:
# Split the `df_sw` corpus; split() uses the same test_size and random_state as above.
X_train_sw, X_test_sw, y_train_sw, y_test_sw = split(stop_words=True)

#### Perform Vectorization

Prepare data for these scenarios (here "+ stop words" means the text in which stop words are kept, i.e. `df_sw`):
1. unigrams
2. unigrams + stop words
3. bigrams
4. bigrams + stop words
5. unigrams + bigrams
6. unigrams + bigrams + stop words

Create a function to perform vectorization.

In [7]:
def vectorize(ngram_range=(1, 1), stop_words=False):
  """Build TF-IDF matrices for the train and test text of one corpus.

  A fresh `TfidfVectorizer` is fitted on the training text only and then
  applied to the test text. The fitted vectorizer itself is not returned.

  Args:
    ngram_range: Passed straight to `TfidfVectorizer(ngram_range=...)`.
    stop_words: When True, use `X_train_sw`/`X_test_sw`; otherwise use
      `X_train`/`X_test`. This only picks a corpus; it is not forwarded to
      `TfidfVectorizer`.

  Returns:
    Tuple `(X_train_matrix, X_test_matrix)`.
  """
  print("ngram_range:", ngram_range)
  print("stop_words:", stop_words)

  vectorizer = TfidfVectorizer(ngram_range=ngram_range)

  # Choose the text split that matches the requested corpus.
  train_text, test_text = (X_train_sw, X_test_sw) if stop_words else (X_train, X_test)

  # Fit on the training text only; the test text reuses that fitted state.
  X_train_matrix = vectorizer.fit_transform(train_text)
  X_test_matrix = vectorizer.transform(test_text)

  for label, matrix in (("X_train_matrix", X_train_matrix), ("X_test_matrix", X_test_matrix)):
    print(label + " shape:", matrix.shape)

  return (X_train_matrix, X_test_matrix)

##### Unigrams

In [8]:
# Defaults written out so every scenario cell reads the same way.
X_train_uni, X_test_uni = vectorize(ngram_range=(1, 1), stop_words=False)

ngram_range: (1, 1)
stop_words: False
X_train_matrix shape: (25332, 33634)
X_test_matrix shape: (6333, 33634)


##### Unigrams + Stop Words

In [9]:
# Unigram features from the _sw text split.
X_train_uni_sw, X_test_uni_sw = vectorize(ngram_range=(1, 1), stop_words=True)

ngram_range: (1, 1)
stop_words: True
X_train_matrix shape: (25332, 33926)
X_test_matrix shape: (6333, 33926)


##### Bigrams

In [10]:
# ngram_range=(2, 2) restricts the features to bigrams.
X_train_bi, X_test_bi = vectorize(ngram_range=(2, 2), stop_words=False)

ngram_range: (2, 2)
stop_words: False
X_train_matrix shape: (25332, 613031)
X_test_matrix shape: (6333, 613031)


##### Bigrams + Stop Words

In [11]:
# Bigram features from the _sw text split; keywords replace the bare positional True.
X_train_bi_sw, X_test_bi_sw = vectorize(ngram_range=(2, 2), stop_words=True)

ngram_range: (2, 2)
stop_words: True
X_train_matrix shape: (25332, 549842)
X_test_matrix shape: (6333, 549842)


##### Unigrams + Bigrams

In [12]:
# ngram_range=(1, 2) combines unigrams and bigrams in one feature space.
X_train_uni_bi, X_test_uni_bi = vectorize(ngram_range=(1, 2), stop_words=False)

ngram_range: (1, 2)
stop_words: False
X_train_matrix shape: (25332, 646665)
X_test_matrix shape: (6333, 646665)


##### Unigrams + Bigrams + Stop Words

In [13]:
# Unigram + bigram features from the _sw text split.
X_train_uni_bi_sw, X_test_uni_bi_sw = vectorize(ngram_range=(1, 2), stop_words=True)

ngram_range: (1, 2)
stop_words: True
X_train_matrix shape: (25332, 583768)
X_test_matrix shape: (6333, 583768)


#### Export Objects

Create a function to export objects.

In [14]:
def export_objects(obj, file_name, data_dir="../data/"):
  """Pickle `obj` to `file_name` inside `data_dir`.

  Args:
    obj: Any picklable object.
    file_name: Name of the file to create (or overwrite) within `data_dir`.
      Expected to be a plain file name, not an absolute path.
    data_dir: Destination directory. Defaults to "../data/", the location
      this notebook has always written to, so existing two-argument calls
      behave exactly as before.

  Raises:
    FileNotFoundError: If `data_dir` does not exist (it is not created here).
  """
  # os.path.join accepts `data_dir` with or without a trailing separator,
  # unlike plain string concatenation.
  path = os.path.join(data_dir, file_name)
  with open(path, "wb") as f:
    pickle.dump(obj, f)

In [15]:
# Only one pair of label files is exported, yet the *_sw feature matrices come
# from a separate split. They are only usable with these labels if both splits
# produced identical labels in identical order, so fail loudly here instead of
# silently pairing features with the wrong labels downstream.
assert y_train.equals(y_train_sw), "y_train and y_train_sw differ; export the _sw labels separately"
assert y_test.equals(y_test_sw), "y_test and y_test_sw differ; export the _sw labels separately"

# (object, file name) pairs; each object is pickled under that file name.
export_list = [
  (y_train, "y_train.pickle"),
  (y_test, "y_test.pickle"),
  (X_train_uni, "X_train_uni.pickle"),
  (X_test_uni, "X_test_uni.pickle"),
  (X_train_uni_sw, "X_train_uni_sw.pickle"),
  (X_test_uni_sw, "X_test_uni_sw.pickle"),
  (X_train_bi, "X_train_bi.pickle"),
  (X_test_bi, "X_test_bi.pickle"),
  (X_train_bi_sw, "X_train_bi_sw.pickle"),
  (X_test_bi_sw, "X_test_bi_sw.pickle"),
  (X_train_uni_bi, "X_train_uni_bi.pickle"),
  (X_test_uni_bi, "X_test_uni_bi.pickle"),
  (X_train_uni_bi_sw, "X_train_uni_bi_sw.pickle"),
  (X_test_uni_bi_sw, "X_test_uni_bi_sw.pickle")
]

for obj, file_name in export_list:
  export_objects(obj, file_name)