# TF-IDF
Use a TF-IDF vectorizer with classical scikit-learn classifiers (Bernoulli Naive Bayes, k-nearest neighbors, and logistic regression).

## Import Packages and Environment Variables

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tqdm.auto as tqdm
import os

In [3]:
# --- Data locations ---------------------------------------------------------
# CSV paths are resolved against the current working directory of the kernel.
DIR_PATH = os.getcwd()
DATA_PATH = "data/"
TRAIN_DATA_PATH, TEST_DATA_PATH = (
    os.path.join(DIR_PATH, DATA_PATH, csv_name)
    for csv_name in ("train.csv", "test.csv")
)

# --- Splitting / reproducibility --------------------------------------------
SEED = 1234              # shared seed for numpy and the train/validation split
VALIDATION_RATIO = 0.05  # fraction of the training data held out for validation

# --- Preprocessing switches --------------------------------------------------
REMOVE_DUPLICATE = False  # when True, repeated reviews are dropped before training

# --- Neural-network style settings -------------------------------------------
# NOTE(review): none of the active cells visible in this notebook read the
# values below; they look like leftovers from a transformer experiment —
# confirm before removing.
TRANSFORMER_NAME = 'bert-base-uncased'
BATCH_SIZE = 16
LEARNING_RATE = 5e-7
EPOCHS = 7

In [4]:
# Seed numpy's global random number generator so runs are repeatable.
np.random.seed(seed=SEED)

# The torch seeding below is disabled; no active cell visible in this notebook
# uses torch. Re-enable it together with a torch import if a torch model returns.
# torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

In [5]:
# Register tqdm with pandas; `progress_map` is used later when encoding labels.
tqdm.tqdm.pandas()  # enable progress_apply and progress_map for pandas

## Data Preprocessing and Feature Engineering

In [6]:
# Read the labelled training set and the unlabelled test set from disk.
train_data, test = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

# Preview the first rows. The rendered output shows an `Unnamed: 0` column next
# to `review` and `sentiment` (presumably a saved index — it is not used later).
train_data.head()

Unnamed: 0,review,sentiment
0,the single worst film i've ever seen in a thea...,negative
1,I was actually around 13 years old camping nea...,positive
2,A small town is attacked by a horde of bloodth...,negative
3,I think the problem with this show not getting...,positive
4,"Wow, this movie was horrible. As a Bills fan I...",negative


In [7]:
# Optionally drop rows whose review text repeats (controlled by REMOVE_DUPLICATE).
if REMOVE_DUPLICATE:
    # Report how many rows are duplicates of an earlier review.
    duplicate_count = train_data.duplicated(subset=["review"]).sum()
    print(duplicate_count)
    # Drop them and renumber the rows from zero.
    train_data = (
        train_data
        .drop_duplicates(subset=["review"])
        .reset_index(drop=True)
    )

In [8]:
# Encode the sentiment labels as integers: "positive" -> 1, anything else -> 0.
#
# An already-encoded 1 is also accepted as positive, which makes this cell
# idempotent. Previously, re-running it compared the integers 1/0 against
# "positive" and silently turned every label into 0.
train_data["sentiment"] = train_data["sentiment"].progress_map(
    lambda label: 1 if label in ("positive", 1) else 0
)

  0%|          | 0/40000 [00:00<?, ?it/s]

## Split training set and validation set

In [9]:
# Hold out VALIDATION_RATIO of the labelled rows for validation; the fixed
# random_state keeps the split the same across runs.
train_split, valid_split = train_test_split(
    train_data,
    test_size=VALIDATION_RATIO,
    random_state=SEED,
)

# Renumber both splits from zero so they no longer carry the original row index.
train_data = train_split.reset_index(drop=True)
valid_data = valid_split.reset_index(drop=True)

### Fit TF-IDF vectorizer

In [10]:
# Fit the TF-IDF vectorizer on the training reviews only; the validation and
# test reviews are transformed with it in the next cell.
vectorizer = TfidfVectorizer().fit(train_data['review'])
vectorizer

In [11]:
# Transform each split's reviews with the vectorizer fitted on the training
# data, in the order train, validation, test.
vectorized_train, vectorized_valid, vectorized_test = (
    vectorizer.transform(frame['review'])
    for frame in (train_data, valid_data, test)
)

## Dataset and Dataloader

In [12]:
# class dataset(Dataset):
#     """
#     Custom dataset
#     Args:
#         data (pd.DataFrame): DataFrame containing 'input_ids' and 'sentiment' columns.
#     """
#     def __init__(self, data, labels):
#         super().__init__()
#         self.ids = data
#         self.labels = labels.tolist()
    
#     def __len__(self):
#         return len(self.labels)
    
#     def __getitem__(self, ind):
#         ids_tensor = torch.tensor(self.ids[ind], dtype=torch.int64)

#         labels_tensor = torch.tensor(self.labels[ind], dtype=torch.int64)
#         return {"ids" : ids_tensor, "label" : labels_tensor}

In [13]:
# train_dataset = dataset(vectorized_train, train_data['sentiment'])
# valid_dataset = dataset(vectorized_valid, valid_data['sentiment'])

In [14]:
# train_loader = DataLoader(
#         train_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=True
#     )
# valid_loader = DataLoader(
#         valid_dataset,
#         batch_size=BATCH_SIZE,
#         shuffle=False
#     )

## Model 

### Bernoulli Naive Bayes

In [15]:
# Train a Bernoulli Naive Bayes classifier on the TF-IDF training features.
bnb = BernoulliNB().fit(vectorized_train, train_data['sentiment'])
bnb

In [16]:
# Score the Naive Bayes model on the held-out validation split.
valid_labels = valid_data['sentiment']
pred_valid = bnb.predict(vectorized_valid)
report = classification_report(valid_labels, pred_valid, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.8190    0.8751    0.8462       993
           1     0.8679    0.8093    0.8376      1007

    accuracy                         0.8420      2000
   macro avg     0.8435    0.8422    0.8419      2000
weighted avg     0.8437    0.8420    0.8419      2000



In [17]:
# Predict the test set with Naive Bayes and map 1 -> "positive", anything
# else -> "negative".
test_predictions = bnb.predict(vectorized_test)
sentiment_names = [
    "positive" if label == 1 else "negative" for label in test_predictions
]

# Write the submission file (overwrites any existing submission.csv).
submissions = pd.DataFrame({'id': test['id'], 'sentiment': sentiment_names})
submissions.to_csv("submission.csv", index=False)

### K-Nearest Neighbors


In [None]:
# Fit a 30-neighbour, distance-weighted k-NN classifier on the TF-IDF training
# features, using all available cores (n_jobs=-1).
knn = KNeighborsClassifier(
    weights='distance',
    n_neighbors=30,
    n_jobs=-1,
).fit(vectorized_train, train_data['sentiment'])
knn

In [None]:
# Score the k-NN model on the held-out validation split.
valid_labels = valid_data['sentiment']
pred_valid = knn.predict(X=vectorized_valid)
report = classification_report(valid_labels, pred_valid, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.7399    0.8107    0.7737       993
           1     0.7939    0.7190    0.7546      1007

    accuracy                         0.7645      2000
   macro avg     0.7669    0.7648    0.7641      2000
weighted avg     0.7671    0.7645    0.7640      2000



In [None]:
# Predict the test set with k-NN and map 1 -> "positive", anything else ->
# "negative".
test_predictions = knn.predict(vectorized_test)
sentiment_names = [
    "positive" if label == 1 else "negative" for label in test_predictions
]

# Write the submission file (overwrites any existing submission.csv).
submissions = pd.DataFrame({'id': test['id'], 'sentiment': sentiment_names})
submissions.to_csv("submission.csv", index=False)

### Logistic Regression

In [48]:
# Fit logistic regression on the TF-IDF training features, allowing up to
# 1000 solver iterations.
# NOTE(review): n_jobs presumably has no effect for a two-class problem with
# the default solver — confirm against the installed scikit-learn version.
lr = LogisticRegression(n_jobs=-2, max_iter=1000).fit(
    vectorized_train, train_data['sentiment']
)
lr

In [49]:
# Score the logistic-regression model on the held-out validation split.
valid_labels = valid_data['sentiment']
pred_valid = lr.predict(X=vectorized_valid)
report = classification_report(valid_labels, pred_valid, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.8866    0.8741    0.8803       993
           1     0.8776    0.8898    0.8836      1007

    accuracy                         0.8820      2000
   macro avg     0.8821    0.8819    0.8820      2000
weighted avg     0.8821    0.8820    0.8820      2000



In [50]:
# Predict the test set with logistic regression and map 1 -> "positive",
# anything else -> "negative".
test_predictions = lr.predict(vectorized_test)
sentiment_names = [
    "positive" if label == 1 else "negative" for label in test_predictions
]

# Write the submission file (overwrites any existing submission.csv).
submissions = pd.DataFrame({'id': test['id'], 'sentiment': sentiment_names})
submissions.to_csv("submission.csv", index=False)