In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score, recall_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [2]:
# read train and test file
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
# visualize train_data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14545 entries, 0 to 14544
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_reputation  14545 non-null  int64 
 1   reply_count      14545 non-null  int64 
 2   thumbs_up        14545 non-null  int64 
 3   thumbs_down      14545 non-null  int64 
 4   best_score       14545 non-null  int64 
 5   text             14543 non-null  object
 6   stars            14545 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 795.6+ KB


In [4]:
train_data.head()

Unnamed: 0,user_reputation,reply_count,thumbs_up,thumbs_down,best_score,text,stars
0,1,0,0,0,100,Tasty!,5
1,1,0,0,0,100,As soon as I saw this on the cover of the maga...,5
2,1,0,0,0,100,This recipe is great! I have never made bread ...,5
3,10,0,5,2,261,"@Sarah (from Dec. 16, 2019): What the recipe d...",0
4,1,0,0,0,100,This was absolutely delish! My whole family ...,5


In [5]:
train_data.describe()

Unnamed: 0,user_reputation,reply_count,thumbs_up,thumbs_down,best_score,stars
count,14545.0,14545.0,14545.0,14545.0,14545.0,14545.0
mean,2.165211,0.014163,1.102509,0.54823,153.924579,4.285115
std,9.950558,0.134494,4.177282,3.373137,142.637031,1.546951
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,100.0,5.0
50%,1.0,0.0,0.0,0.0,100.0,5.0
75%,1.0,0.0,0.0,0.0,100.0,5.0
max,520.0,3.0,80.0,122.0,946.0,5.0


In [6]:
# visualize train_data, same stpes as train_data
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3637 entries, 0 to 3636
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_reputation  3637 non-null   int64 
 1   reply_count      3637 non-null   int64 
 2   thumbs_up        3637 non-null   int64 
 3   thumbs_down      3637 non-null   int64 
 4   best_score       3637 non-null   int64 
 5   text             3637 non-null   object
 6   stars            3637 non-null   int64 
dtypes: int64(6), object(1)
memory usage: 199.0+ KB


In [7]:
test_data.head()

Unnamed: 0,user_reputation,reply_count,thumbs_up,thumbs_down,best_score,text,stars
0,1,0,0,0,100,i am on the fence with this one it was alright...,3
1,1,0,0,0,100,I just found this recipe online after losing i...,0
2,1,0,57,8,873,We have made this recipe several times and enj...,0
3,1,0,0,0,100,I made the exact recipe as is and it is wonder...,5
4,1,0,0,1,100,Have been on the hunt for the best Stuffed Pep...,5


In [8]:
test_data.describe()

Unnamed: 0,user_reputation,reply_count,thumbs_up,thumbs_down,best_score,stars
count,3637.0,3637.0,3637.0,3637.0,3637.0,3637.0
mean,2.137201,0.016497,1.036294,0.553753,150.113005,4.303547
std,10.268407,0.151096,4.294738,3.834041,134.624598,1.536222
min,0.0,0.0,0.0,0.0,4.0,0.0
25%,1.0,0.0,0.0,0.0,100.0,5.0
50%,1.0,0.0,0.0,0.0,100.0,5.0
75%,1.0,0.0,0.0,0.0,100.0,5.0
max,510.0,3.0,106.0,126.0,922.0,5.0


In [9]:
# Since a score of 0 denoting an absence of rating, we remove all the zero in stars col for train and test dataset
train_data = train_data[train_data['stars'] !=0] 
test_data = test_data[test_data['stars'] !=0] 

In [10]:
# check correlation for each numeric feactures
numeric_train_data = train_data[['user_reputation', 'reply_count', 'thumbs_up', 'thumbs_down', 'best_score', 'stars']]
numeric_train_data.corr()

Unnamed: 0,user_reputation,reply_count,thumbs_up,thumbs_down,best_score,stars
user_reputation,1.0,0.008163,0.060865,0.026532,0.146126,0.003879
reply_count,0.008163,1.0,0.18509,0.291853,0.179159,-0.113347
thumbs_up,0.060865,0.18509,1.0,0.401592,0.692462,-0.006781
thumbs_down,0.026532,0.291853,0.401592,1.0,0.230069,-0.245342
best_score,0.146126,0.179159,0.692462,0.230069,1.0,0.019012
stars,0.003879,-0.113347,-0.006781,-0.245342,0.019012,1.0


# From the correlation table, scores around above 0.75 or below -0.75 are identified as having a strong correlation. However, in this table, all the scores are below 0.75 or above -0.75, which indicates that these features have a low correlation. We must consider another approach to solve this problem.

In [11]:
# same steps for test data
test_data['text'] = test_data['text'].str.lower()
test_data['text'] = test_data['text'].str.replace('[^\w\s]', '', regex=True)
train_data['text'] = train_data['text'].str.lower()
train_data['text'] = train_data['text'].str.replace('[^\w\s]', '', regex=True)
# remove nan from text
train_data['text'] = train_data['text'].fillna('')
test_data['text'] = test_data['text'].fillna('')

In [13]:
# setup CountVectorizer
c_vectorizer = CountVectorizer(
    stop_words='english',
    dtype=float
)

# setup LogisticRegression
pipeline1 = Pipeline([
    ('cv', c_vectorizer),
    ('lr', LogisticRegression(max_iter=10000))
])

param_grid1 = {
    'cv__max_df': [0.5, 0.75, 1.0],
    'cv__ngram_range': [(1, 1), (1, 2)],
    'lr__C': [0.1, 1, 10],
    'lr__penalty': ['l2'],
    'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

grid_search1 = GridSearchCV(pipeline1, param_grid1, cv=5, verbose=1, n_jobs=-1)
grid_search1.fit(train_data['text'], train_data['stars'])
best_model1 = grid_search1.best_estimator_
predictions = best_model1.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)
print("Best Model Parameters:", grid_search1.best_params_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
Best Model Accuracy: 0.8580527752502275
Best Model Precision: 0.8146297953410058
Best Model Recall: 0.8580527752502275
Best Model Parameters: {'cv__max_df': 0.5, 'cv__ngram_range': (1, 2), 'lr__C': 1, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}


In [17]:
# Setup DecisionTreeClassifier
pipeline2 = Pipeline([
    ('cv', c_vectorizer),
    ('dt', DecisionTreeClassifier())
])

param_grid2 = {
    'cv__max_df': [0.5, 0.75, 1.0],
    'cv__ngram_range': [(1, 1), (1, 2)],
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [None, 10, 20, 30],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
    'dt__max_features': [None, 'auto', 'sqrt', 'log2']
}

grid_search2 = GridSearchCV(pipeline2, param_grid2, cv=5, verbose=1, n_jobs=-1)
grid_search2.fit(train_data['text'], train_data['stars'])
best_model2 = grid_search2.best_estimator_
predictions = best_model2.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Best Model Accuracy: 0.8437973915680922
Best Model Precision: 0.7762713244004648
Best Model Recall: 0.8437973915680922


In [18]:
# Setup RandomForestClassifier
pipeline3 = Pipeline([
    ('cv', c_vectorizer),
    ('rf', RandomForestClassifier())
])

param_grid3 = {
    'cv__max_df': [0.5, 0.75, 1.0],
    'cv__ngram_range': [(1, 1), (1, 2)],
    'rf__n_estimators': [100],
    #'rf__criterion': ['gini', 'entropy'], I have to remove few and use defult, cuz my computer will just run forever
    'rf__max_depth': [None, 10, 20, 30],
    #'rf__min_samples_split': [2, 5, 10],
    #'rf__min_samples_leaf': [1, 2, 4],
    #'rf__max_features': [None, 'auto', 'sqrt', 'log2'],
}

grid_search3 = GridSearchCV(pipeline3, param_grid3, cv=5, verbose=1, n_jobs=-1)
grid_search3.fit(train_data['text'], train_data['stars'])
best_model3 = grid_search3.best_estimator_
predictions = best_model3.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)
print("Best Model Parameters:", grid_search3.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Model Accuracy: 0.8528965726417955
Best Model Precision: 0.8249335975091524
Best Model Recall: 0.8528965726417955
Best Model Parameters: {'cv__max_df': 0.5, 'cv__ngram_range': (1, 1), 'rf__max_depth': None, 'rf__n_estimators': 100}


In [19]:
# Setup MultinomialNB
pipeline4 = Pipeline([
    ('cv', c_vectorizer),
    ('nb', MultinomialNB())
])

param_grid4 = {
    'cv__max_df': [0.5, 0.75, 1.0],
    'cv__ngram_range': [(1, 1), (1, 2)],
    'nb__alpha': [1.0, 0.1, 0.01],
}

grid_search4 = GridSearchCV(pipeline4, param_grid4, cv=5, verbose=1, n_jobs=-1)
grid_search4.fit(train_data['text'], train_data['stars'])
best_model4 = grid_search4.best_estimator_
predictions = best_model4.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)
print("Best Model Parameters:", grid_search4.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Model Accuracy: 0.8474370639975736
Best Model Precision: 0.7819332918593693
Best Model Recall: 0.8474370639975736
Best Model Parameters: {'cv__max_df': 0.5, 'cv__ngram_range': (1, 2), 'nb__alpha': 1.0}


  _warn_prf(average, modifier, msg_start, len(result))


# The best accuracy for countvectorizer is LogisticRegression and Best Model Parameters: {'cv__max_df': 0.5, 'cv__ngram_range': (1, 2), 'lr__C': 1, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}, 85.87% (drop 0), 78.28% (with 0)}

In [20]:
# setup CountVectorizer
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    dtype=float
)

In [21]:
# set up LogisticRegression
pipeline5 = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('lr', LogisticRegression(max_iter=10000))])

# we will use same params for grid
param_grid1 = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'lr__C': [0.1, 1, 10],
    'lr__penalty': ['l2'],
    'lr__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

grid_search5 = GridSearchCV(pipeline5, param_grid1, cv=5, verbose=1, n_jobs=-1)
grid_search5.fit(train_data['text'], train_data['stars'])
best_model5 = grid_search5.best_estimator_
predictions = best_model5.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)
print("Best Model Parameters:", grid_search5.best_params_)

Fitting 5 folds for each of 90 candidates, totalling 450 fits




Best Model Accuracy: 0.8601759175007583
Best Model Precision: 0.816844612059923
Best Model Recall: 0.8601759175007583
Best Model Parameters: {'lr__C': 10, 'lr__penalty': 'l2', 'lr__solver': 'saga', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


In [22]:
# setup DecisionTreeClassifier
pipeline6 = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('dt', DecisionTreeClassifier())])

# we will use same params for grid
param_grid2 = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [None, 10, 20, 30],
    'dt__min_samples_split': [2, 5, 10],
    'dt__min_samples_leaf': [1, 2, 4],
    'dt__max_features': [None, 'auto', 'sqrt', 'log2']
}

grid_search6 = GridSearchCV(pipeline6, param_grid2, cv=5, verbose=1, n_jobs=-1)
grid_search6.fit(train_data['text'], train_data['stars'])
best_model6 = grid_search6.best_estimator_
predictions = best_model6.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)
print("Best Model Parameters:", grid_search6.best_params_)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits




Best Model Accuracy: 0.8444040036396724
Best Model Precision: 0.769785944424968
Best Model Recall: 0.8444040036396724
Best Model Parameters: {'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__max_features': 'sqrt', 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 5, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}


  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
# setup RandomForestClassifier
pipeline7 = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('rf', RandomForestClassifier())
])

param_grid3 = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'rf__n_estimators': [100],
    #'rf__criterion': ['gini', 'entropy'], I have to remove few and use defult, cuz my computer will just run forever
    'rf__max_depth': [None, 10, 20, 30],
    #'rf__min_samples_split': [2, 5, 10],
    #'rf__min_samples_leaf': [1, 2, 4],
    #'rf__max_features': [None, 'auto', 'sqrt', 'log2'],
}

grid_search7 = GridSearchCV(pipeline7, param_grid3, cv=5, verbose=1, n_jobs=-1)
grid_search7.fit(train_data['text'], train_data['stars'])
best_model7 = grid_search7.best_estimator_
predictions = best_model7.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)
print("Best Model Parameters:", grid_search7.best_params_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




Best Model Accuracy: 0.851380042462845
Best Model Precision: 0.8340121079301349
Best Model Recall: 0.851380042462845
Best Model Parameters: {'rf__max_depth': None, 'rf__n_estimators': 100, 'tfidf__max_df': 0.75, 'tfidf__ngram_range': (1, 1)}


In [24]:
# Setup MultinomialNB
pipeline8 = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('nb', MultinomialNB())
])

param_grid4 = {
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'nb__alpha': [1.0, 0.1, 0.01],
}

grid_search8 = GridSearchCV(pipeline8, param_grid4, cv=5, verbose=1, n_jobs=-1)
grid_search8.fit(train_data['text'], train_data['stars'])
best_model8 = grid_search8.best_estimator_
predictions = best_model8.predict(test_data['text'])
accuracy = accuracy_score(test_data['stars'], predictions)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)
print("Best Model Parameters:", grid_search8.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




Best Model Accuracy: 0.8535031847133758
Best Model Precision: 0.8122159526777399
Best Model Recall: 0.8535031847133758
Best Model Parameters: {'nb__alpha': 0.01, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


# The best accuracy for TF-IDF is LogisticRegression and Best Model Parameters: {'lr__C': 10, 'lr__penalty': 'l2', 'lr__solver': 'liblinear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}, 85.90% (dropped o) or 78.66% (with 0)

In [26]:
# Extract Basic Text Features using pandas, for LogisticRegression.
train_data['text_length'] = train_data['text'].fillna('').apply(len)
test_data['text_length'] = test_data['text'].fillna('').apply(len)
y_train = train_data['stars']
y_test = test_data['stars']
X_train_pd = train_data[['text_length']]
X_test_pd = test_data[['text_length']]
lr_model = LogisticRegression()
lr_model.fit(X_train_pd, y_train)
predictions = lr_model.predict(X_test_pd)
accuracy = accuracy_score(y_test, predictions)
print(accuracy)
precision = precision_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
recall = recall_score(test_data['stars'], predictions, average='weighted',labels=[1, 2, 3, 4, 5])
print("Best Model Accuracy:", accuracy)
print("Best Model Precision:", precision)
print("Best Model Recall:", recall)

0.8456172277828329
Best Model Accuracy: 0.8456172277828329
Best Model Precision: 0.7150684959231235
Best Model Recall: 0.8456172277828329


  _warn_prf(average, modifier, msg_start, len(result))


# The accuracy for the model using length for basic text information and Logistic Regression is 84.56%(dropped 0) and 76.66% (with 0). 

In [27]:
# save the result predict into csv use the best model: TF-IDF x LogisticRegression
vectorizer = TfidfVectorizer(
    ngram_range = (1, 2),
    max_df = 0.5
)
X_train_tfidf = vectorizer.fit_transform(train_data['text'])
X_test_tfidf = vectorizer.transform(test_data['text'])
y_train = train_data['stars'] 
y_test = test_data['stars']
model = LogisticRegression(max_iter=1000, penalty = 'l2', C = 10, solver = 'liblinear')
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)
with open('accuracy.csv', 'w') as f:
    f.write(str(predictions))
accuracy = accuracy_score(y_test, predictions)
with open('accuracy.csv', 'w') as f:
    f.write(str(accuracy))

# Use transformers for advance improvements.

In [28]:
# refrence https://huggingface.co/docs/transformers/en/preprocessing, it took too much time for trainng. I keep the code but won't use it 
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm

  torch.utils._pytree._register_pytree_node(


In [29]:
'''
# Load data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
'''

"\n# Load data\ntrain_data = pd.read_csv('train.csv')\ntest_data = pd.read_csv('test.csv')\n"

In [30]:
'''
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
'''

'\n# Initialize tokenizer\ntokenizer = AutoTokenizer.from_pretrained("bert-base-cased")\n'

In [31]:
'''
# Preprocess and tokenize the text
def preprocess_and_tokenize(data):
    data['text'] = data['text'].str.lower().str.replace('[^\w\s]', '', regex=True).fillna('')
    return tokenizer(data['text'].tolist(), truncation=True, padding=True, return_tensors="pt")
'''

'\n# Preprocess and tokenize the text\ndef preprocess_and_tokenize(data):\n    data[\'text\'] = data[\'text\'].str.lower().str.replace(\'[^\\w\\s]\', \'\', regex=True).fillna(\'\')\n    return tokenizer(data[\'text\'].tolist(), truncation=True, padding=True, return_tensors="pt")\n'

In [32]:
'''
train_encodings = preprocess_and_tokenize(train_data)
test_encodings = preprocess_and_tokenize(test_data)
'''

'\ntrain_encodings = preprocess_and_tokenize(train_data)\ntest_encodings = preprocess_and_tokenize(test_data)\n'

In [33]:
'''
# Creating datasets
train_labels = torch.tensor(train_data['stars'].values)
test_labels = torch.tensor(test_data['stars'].values)

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Creating data loaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2)
'''

"\n# Creating datasets\ntrain_labels = torch.tensor(train_data['stars'].values)\ntest_labels = torch.tensor(test_data['stars'].values)\n\ntrain_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)\ntest_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)\n\n# Creating data loaders\ntrain_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)\ntest_loader = DataLoader(test_dataset, batch_size=2)\n"

In [34]:
'''
# Load and prepare the model
model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=6)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
'''

'\n# Load and prepare the model\nmodel = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=6)\ndevice = torch.device("cuda" if torch.cuda.is_available() else "cpu")\nmodel.to(device)\n'

In [35]:
'''
# Initialize optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 150
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)
'''

'\n# Initialize optimizer and scheduler\noptimizer = AdamW(model.parameters(), lr=5e-5)\nepochs = 150\nscheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * epochs)\n'

In [36]:
'''
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss}")
    model.eval()
    '''

'\nmodel.train()\nfor epoch in range(epochs):\n    total_loss = 0\n    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):\n        batch = tuple(b.to(device) for b in batch)\n        inputs = {\n            \'input_ids\': batch[0],\n            \'attention_mask\': batch[1],\n            \'labels\': batch[2]\n        }\n        optimizer.zero_grad()\n        outputs = model(**inputs)\n        loss = outputs.loss\n        loss.backward()\n        optimizer.step()\n        scheduler.step()  \n        total_loss += loss.item()\n\n    avg_loss = total_loss / len(train_loader)\n    print(f"Epoch {epoch+1}, Average Loss: {avg_loss}")\n    model.eval()\n    '

In [37]:
'''
predictions, true_labels = [], []

for batch in test_loader:
    batch = tuple(b.to(device) for b in batch)
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1]
    }
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predictions.append(logits.argmax(dim=-1).cpu().numpy())
    true_labels.append(batch[2].cpu().numpy())
   ''' 

"\npredictions, true_labels = [], []\n\nfor batch in test_loader:\n    batch = tuple(b.to(device) for b in batch)\n    inputs = {\n        'input_ids': batch[0],\n        'attention_mask': batch[1]\n    }\n    with torch.no_grad():\n        outputs = model(**inputs)\n    \n    logits = outputs.logits\n    predictions.append(logits.argmax(dim=-1).cpu().numpy())\n    true_labels.append(batch[2].cpu().numpy())\n   "

In [38]:
'''
# Flatten lists
predictions = np.concatenate(predictions)
true_labels = np.concatenate(true_labels)

accuracy = accuracy_score(true_labels, predictions)
print("Accuracy:", accuracy)
'''

'\n# Flatten lists\npredictions = np.concatenate(predictions)\ntrue_labels = np.concatenate(true_labels)\n\naccuracy = accuracy_score(true_labels, predictions)\nprint(f"Accuracy: {accuracy:.4f}")\n'

# Use Voting Classifier for advance improvements.

In [39]:
'''
from sklearn.ensemble import VotingClassifier

pipe_lr = Pipeline([('vect', vectorizer), ('clf', LogisticRegression(max_iter=10000))])
pipe_dt = Pipeline([('vect', vectorizer), ('clf', DecisionTreeClassifier())])
pipe_rf = Pipeline([('vect', vectorizer), ('clf', RandomForestClassifier())])
pipe_nb = Pipeline([('vect', vectorizer), ('clf', MultinomialNB())])

# setup voting classifier
voting_clf = VotingClassifier(estimators=[
    ('lr', pipe_lr), 
    ('dt', pipe_dt), 
    ('rf', pipe_rf), 
    ('nb', pipe_nb)
], voting='hard')
voting_clf.fit(X_train, y_train)
predictions = voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracy
predictions
'''

"\nfrom sklearn.ensemble import VotingClassifier\n\npipe_lr = Pipeline([('vect', vectorizer), ('clf', LogisticRegression(max_iter=10000))])\npipe_dt = Pipeline([('vect', vectorizer), ('clf', DecisionTreeClassifier())])\npipe_rf = Pipeline([('vect', vectorizer), ('clf', RandomForestClassifier())])\npipe_nb = Pipeline([('vect', vectorizer), ('clf', MultinomialNB())])\n\n# setup voting classifier\nvoting_clf = VotingClassifier(estimators=[\n    ('lr', pipe_lr), \n    ('dt', pipe_dt), \n    ('rf', pipe_rf), \n    ('nb', pipe_nb)\n], voting='hard')\nvoting_clf.fit(X_train, y_train)\npredictions = voting_clf.predict(X_test)\naccuracy = accuracy_score(y_test, predictions)\naccuracy\npredictions\n"

# Q1 Using TF-IDF x (logistic regression,DecisionTree Classifier,RandomForest Classifier,MultinomialNB) for classification_report 

In [42]:
# logistic regression
vectorizer = TfidfVectorizer(
    ngram_range = (1, 2),
    max_df = 0.5
)
X_train_tfidf = vectorizer.fit_transform(train_data['text'])
X_test_tfidf = vectorizer.transform(test_data['text'])
y_train = train_data['stars'] 
y_test = test_data['stars']
model = LogisticRegression(max_iter=1000, penalty = 'l2', C = 10, solver = 'liblinear')
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           1       0.67      0.17      0.28        46
           2       0.38      0.07      0.12        44
           3       0.50      0.15      0.23        98
           4       0.49      0.18      0.27       321
           5       0.88      0.99      0.93      2788

    accuracy                           0.86      3297
   macro avg       0.58      0.31      0.37      3297
weighted avg       0.82      0.86      0.83      3297



In [46]:
# DecisionTree Classifier
model = DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features='sqrt', min_samples_leaf = 4, min_samples_split= 5)
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(classification_report(y_test,predictions, zero_division=1))

              precision    recall  f1-score   support

           1       1.00      0.00      0.00        46
           2       0.00      0.00      0.00        44
           3       0.00      0.00      0.00        98
           4       0.00      0.00      0.00       321
           5       0.85      1.00      0.92      2788

    accuracy                           0.84      3297
   macro avg       0.37      0.20      0.18      3297
weighted avg       0.73      0.84      0.77      3297



In [47]:
# RandomForest Classifier
model = RandomForestClassifier(max_depth = None, n_estimators=100)
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(classification_report(y_test,predictions, zero_division=1))

              precision    recall  f1-score   support

           1       0.67      0.04      0.08        46
           2       1.00      0.02      0.04        44
           3       0.57      0.04      0.08        98
           4       0.86      0.04      0.07       321
           5       0.85      1.00      0.92      2788

    accuracy                           0.85      3297
   macro avg       0.79      0.23      0.24      3297
weighted avg       0.84      0.85      0.79      3297



In [48]:
# MultinomialNB
model = MultinomialNB(alpha=0.01)
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(classification_report(y_test,predictions, zero_division=1))

              precision    recall  f1-score   support

           1       0.80      0.26      0.39        46
           2       1.00      0.02      0.04        44
           3       0.45      0.10      0.17        98
           4       0.48      0.12      0.20       321
           5       0.87      0.99      0.93      2788

    accuracy                           0.86      3297
   macro avg       0.72      0.30      0.35      3297
weighted avg       0.82      0.86      0.82      3297



# How does the performance of your model vary across different classes? Analyze and discuss your observations regarding the precision and recall metrics for each class.

#### Across all models, there's a clear trend: performance on classes 1 through 4 is substantially lower than on class 5. This discrepancy is most likely due to class imbalance, with class 5 significantly dominating the dataset. The high performance on class 5 skews the overall accuracy, making the models appear more effective than they actually are for the minority classes. For Logistic Regression, the precision and recall for class 5 are at a high level; however, classes 1-4 exhibit much lower scores (a similar problem is observed with the other three models). Both the Decision Tree and Random Forest classifiers are affected by class imbalance and exhibit bias issues.

# Considering your analysis, how would you recommend using this model in a real- world application? Discuss any limitations or considerations that should be taken into account.

#### For real-world applications, understanding the content during preprocessing is crucial before developing any model. We need to be mindful of the model's performance in special contexts, such as detecting toxic comments (where recall is important) or email spam detection (where precision is prioritized). Continuous evaluation and updating of our datasets and models, informed by a user feedback loop and performance monitoring, are essential for maintaining relevance and effectiveness. 

#### 
Among the limitations or considerations to be mindful of, bias and fairness stand out. Our models may inadvertently reflect or amplify biases present in the training data. Techniques for bias mitigation and fairness-aware modeling may be necessary to address these concerns. While Logistic Regression models are more interpretable than some complex models, they may still not provide sufficient insight into the reasons behind specific predictions, especially for non-technical users

#### 

Furthermore, the CountVectorizer approach may struggle with new, unseen words, potentially affecting the model's performance on new data. Exploring techniques like word embeddings or transfer learning models, which can generalize better from known to unknown words, can offer improvem as using different embeddingsa (in here I switch to TF-IDF).rea.

# Analyze your data to address the previously identified accuracy issues. Describe your method to address this issue, implement it in code and retrain a classifier, and assess any improvements or ongoing challenges. Your evaluation will be based on your method's appropriateness, not the results.

####

#### In this section, I explored two methods for improvements: switching to a transformer model and using a Voting Classifier. The training process for the transformer was halted due to excessive time consumption and suboptimal GPU performance. Additionally, I implemented a Voting Classifier. Typically, this approach allows for the combination of multiple models, which can compensate for each model's weaknesses. For instance, some methods might underperform for a particular class, while others excel. By assigning different weights to each model, we can optimize performance given the current dataset constraints.