In [1]:
import pandas as pd
import warnings
from langdetect import detect

warnings. simplefilter(action='ignore')

In [2]:
# read data
df = pd.read_csv("./data/train.csv")

# There are many five star reviews, which can bias the classifier, so a random
# half of them is dropped. The fixed seed keeps the retained subset identical
# on every Restart & Run All, so downstream scores are comparable between runs.
is_five_star = df['Score'] == 5
fives = df.loc[is_five_star].sample(frac=0.5, random_state=0)

# Rows with a missing Score compare unequal to 5, so they stay in the frame.
df = pd.concat([df.loc[~is_five_star], fives])

In [3]:
from langdetect import detect

# get relevant fields of data
# Replace missing review text / summaries with empty strings. The filled column
# is assigned back to df directly: the chained form df[col].loc[mask] = ""
# writes to an intermediate object and may leave df itself unchanged.
df['Text'] = df['Text'].fillna("")
df['Summary'] = df['Summary'].fillna("")

# check for different languages
# NOTE(review): kept disabled; det_lan is not defined anywhere in the visible notebook.
# df['Language'] = df['Text'].apply(lambda x: det_lan(x))

In [4]:
# change emoticons into words to parse
# Patterns are raw strings so the regex escapes reach the regex engine untouched.
# The longer emoticons are replaced first: ">:(" and ">:-(" contain ":(" / ":-(",
# so running the "Sad" patterns beforehand would consume them and "angry" could
# never match.
compound_patterns = [r":'-\)", r":`-\(", r">:\(", r">:-\("]
compound_words = ["tear of joy", "tear of sadness", "angry", "angry"]
df["Text"] = df["Text"].replace(compound_patterns, compound_words, regex=True)

happy_patterns = [r":\)", r":-\)", r":-\}", r";-\}", r":->", r";-\)"]
df["Text"] = df["Text"].replace(happy_patterns, ["Happy"] * len(happy_patterns), regex=True)

sad_patterns = [r":-\(", r":\(", r":-\|", r";-\(", r";-<", r"\|-\{"]
df["Text"] = df["Text"].replace(sad_patterns, ["Sad"] * len(sad_patterns), regex=True)

# Literal ":D". The previous pattern used the regex class \D (any non-digit),
# which turned every colon followed by a non-digit character into "laugh".
df["Text"] = df["Text"].replace([r":D"], ["laugh"], regex=True)

# remove punctuation
# regex=True is required: without it Series.str.replace treats the pattern as a
# literal string (the default since pandas 2.0) and nothing is stripped.
df["Text"] = df["Text"].str.replace(r'[^\w\s]', '', regex=True)
df["Summary"] = df["Summary"].str.replace(r'[^\w\s]', '', regex=True)

# separate test and train data: rows with a Score are training rows, rows
# without one are the rows to predict. Explicit copies let later cells add or
# overwrite columns without touching df or raising SettingWithCopyWarning.
has_score = df['Score'].notna()
df_train = df.loc[has_score].copy()
df_test = df.loc[~has_score].copy()

In [5]:
# Lower-case the review text and summary in both splits.
# assign() builds a new frame instead of writing columns onto a frame that was
# sliced out of df, so no SettingWithCopyWarning is triggered.
df_train = df_train.assign(
    Text=df_train['Text'].str.lower(),
    Summary=df_train['Summary'].str.lower(),
)
df_test = df_test.assign(
    Text=df_test['Text'].str.lower(),
    Summary=df_test['Summary'].str.lower(),
)

In [6]:
# Preview the first rows of df after the cleaning steps in the cells above.
df.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,5019281,ADZPIG9QOCDG5,0,0,4.0,1203984000,good version of a classic,This is a charming version of the classic Dick...
1,1,5019281,A35947ZP82G7JH,0,0,3.0,1388361600,Good but not as moving,It was good but not as emotionally moving as t...
2,2,5019281,A3UORV8A9D5L2E,0,0,3.0,1388361600,Winklers Performance was ok at best,Dont get me wrong Winkler is a wonderful chara...
4,4,5019281,A3R27T4HADWFFJ,0,0,4.0,1387670400,Best Scrooge yet,This is one of the best Scrooge movies out He...
5,5,5019281,A2L0G56BNOTX6S,0,0,,1383696000,Dickens updated,This has been a favorite movie of mine for a l...


In [7]:
from nltk.tokenize import word_tokenize

# Tokenise into a copy of the training frame so df_train keeps its plain strings.
df_train_tok = df_train.copy()
for text_col in ("Text", "Summary"):
    # Each cell becomes the list of tokens produced by word_tokenize.
    df_train_tok[text_col] = df_train[text_col].apply(word_tokenize)

In [8]:
from nltk.corpus import stopwords

# remove stop words from tokenized lists on each dataframe entry
stop_words = list(stopwords.words('english')) #About 179 stopwords

# Membership is tested once per token, so look tokens up in a hash set rather
# than scanning the list each time. stop_words itself stays a list.
stop_word_set = frozenset(stop_words)

for text_col in ("Text", "Summary"):
    df_train_tok[text_col] = df_train_tok[text_col].apply(
        lambda tokens: [tok for tok in tokens if tok not in stop_word_set]
    )


In [None]:
from nltk.stem.snowball import SnowballStemmer

# Stem every token so inflected forms share one entry; the original author's
# stated aim is to avoid overfitting.
stemmer = SnowballStemmer("english")


def _stem_tokens(tokens):
    """Return a list holding the Snowball stem of each token in `tokens`."""
    return [stemmer.stem(tok) for tok in tokens]


df_train_tok['Text'] = df_train_tok['Text'].apply(_stem_tokens)
df_train_tok['Summary'] = df_train_tok['Summary'].apply(_stem_tokens)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
# Turn each document into a numeric sparse vector, since the models can only
# compute on numbers.
# NOTE(review): this vectorises df, not df_train_tok, so the tokenising,
# stop-word removal and stemming from the cells above are not reflected here.
# NOTE(review): HashingVectorizer's defaults appear to normalise rows, so these
# may not be raw occurrence counts; confirm against the scikit-learn docs.
print('vectorizer')
count_w = HashingVectorizer()
train_counts_sum = count_w.fit_transform(df['Summary'])
train_counts_txt = count_w.fit_transform(df['Text'])

# Re-weight the hashed term values by inverse document frequency.
# One transformer per column: fitting a single TfidfTransformer twice replaces
# the IDF weights learned from the summaries with those of the review text, so
# the summary weighting could not be applied to new data afterwards.
tfidf_transformer_sum = TfidfTransformer()
train_tfidf = tfidf_transformer_sum.fit_transform(train_counts_sum)

# tfidf_transformer ends up fitted on the review text, as it did before.
tfidf_transformer = TfidfTransformer()
train_tfidf2 = tfidf_transformer.fit_transform(train_counts_txt)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer   

# Build one document per row: the summary followed by the review text.
combined_text = df['Summary'] + " " + df['Text']
str_df = combined_text.values

# Fit a TF-IDF vocabulary on the combined documents; max_df / min_df bound the
# share of documents a term may appear in for it to be kept.
# NOTE(review): count_w and train_counts_sum overwrite the objects of the same
# name created in the previous cell.
count_w = TfidfVectorizer(max_df=0.9, min_df=0.05)
train_counts_sum = count_w.fit_transform(str_df)

In [None]:
# Load test set
submissionSet = pd.read_csv("./data/test.csv")
train_processed = df

# Merge on Id so that the test set can have feature columns as well.
# The drop/rename below expects both frames to carry a Score column, which the
# merge suffixes as Score_x (train_processed) and Score_y (submissionSet); the
# submissionSet one is kept under the plain name.
testX = (
    train_processed
    .merge(submissionSet, left_on='Id', right_on='Id')
    .drop(columns=['Score_x'])
    .rename(columns={'Score_y': 'Score'})
)

# The training set is where the score is not null
scored_rows = train_processed['Score'].notna()
trainX = train_processed.loc[scored_rows]

# Persist both splits; the modelling cells read these files back in.
testX.to_csv("./data/X_test.csv", index=False)
trainX.to_csv("./data/X_train.csv", index=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Load files into DataFrames
X_train = pd.read_csv("./data/X_train.csv")
X_submission = pd.read_csv("./data/X_test.csv")

# Split training set into training and testing set: a quarter of the labelled
# rows is held out, with a fixed seed so the split is repeatable.
feature_frame = X_train.drop(columns=['Score'])
score_labels = X_train['Score']
X_train, X_test, Y_train, Y_test = train_test_split(
    feature_frame,
    score_labels,
    test_size=0.25,
    random_state=0,
)

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# This cell plots, but the only seaborn/matplotlib import in the visible
# notebook sits in a later cell, so a fresh top-to-bottom run would hit a
# NameError here without these two imports.
import seaborn as sns
import matplotlib.pyplot as plt

# This is where you can do more feature selection
# Identifier and raw-text columns cannot be used as numeric features.
non_feature_cols = ['Id', 'ProductId', 'UserId', 'Text', 'Summary']
# 'Unnamed: 0' appears to be the row index saved into train.csv (it mirrors Id
# in the preview above); left in, the model would train on row position.
index_artifact_cols = ['Unnamed: 0']

X_train_processed = (
    X_train.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_test_processed = (
    X_test.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_submission_processed = (
    X_submission.drop(columns=non_feature_cols + ['Score'])
    .drop(columns=index_artifact_cols, errors='ignore')
)

# Learn the model
model = MultinomialNB().fit(X_train_processed, Y_train)

# Predict the score using the model
Y_test_predictions = model.predict(X_test_processed)
X_submission['Score'] = model.predict(X_submission_processed)

# Evaluate your model on the testing set
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_predictions))

# Plot a confusion matrix (each row normalised over the true label)
cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Logistic Regression

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# This is where you can do more feature selection
# Identifier and raw-text columns cannot be used as numeric features.
non_feature_cols = ['Id', 'ProductId', 'UserId', 'Text', 'Summary']
# 'Unnamed: 0' appears to be the row index saved into train.csv (it mirrors Id
# in the preview above); left in, the model would train on row position.
index_artifact_cols = ['Unnamed: 0']

X_train_processed = (
    X_train.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_test_processed = (
    X_test.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_submission_processed = (
    X_submission.drop(columns=non_feature_cols + ['Score'])
    .drop(columns=index_artifact_cols, errors='ignore')
)

# Learn the model
# NOTE(review): the features are unscaled, so the default solver may stop
# before converging; any such warning is hidden by the global filter. Confirm.
model = LogisticRegression().fit(X_train_processed, Y_train)

# Predict the score using the model
Y_test_predictions = model.predict(X_test_processed)
X_submission['Score'] = model.predict(X_submission_processed)

# Evaluate your model on the testing set
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_predictions))

# Plot a confusion matrix (each row normalised over the true label)
cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# This is where you can do more feature selection
# Identifier and raw-text columns cannot be used as numeric features.
non_feature_cols = ['Id', 'ProductId', 'UserId', 'Text', 'Summary']
# 'Unnamed: 0' appears to be the row index saved into train.csv (it mirrors Id
# in the preview above); left in, the tree could split on row position.
index_artifact_cols = ['Unnamed: 0']

X_train_processed = (
    X_train.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_test_processed = (
    X_test.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_submission_processed = (
    X_submission.drop(columns=non_feature_cols + ['Score'])
    .drop(columns=index_artifact_cols, errors='ignore')
)

# Learn the model
# Seeded (same seed as the train/test split) so the fitted tree and the
# reported accuracy are reproducible between runs.
model = DecisionTreeClassifier(random_state=0).fit(X_train_processed, Y_train)

# Predict the score using the model
Y_test_predictions = model.predict(X_test_processed)
X_submission['Score'] = model.predict(X_submission_processed)

# Evaluate your model on the testing set
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_predictions))

# Plot a confusion matrix (each row normalised over the true label)
cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Extra Trees Classifier

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# This is where you can do more feature selection
# Identifier and raw-text columns cannot be used as numeric features.
non_feature_cols = ['Id', 'ProductId', 'UserId', 'Text', 'Summary']
# 'Unnamed: 0' appears to be the row index saved into train.csv (it mirrors Id
# in the preview above); left in, the trees could split on row position.
index_artifact_cols = ['Unnamed: 0']

X_train_processed = (
    X_train.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_test_processed = (
    X_test.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_submission_processed = (
    X_submission.drop(columns=non_feature_cols + ['Score'])
    .drop(columns=index_artifact_cols, errors='ignore')
)

# Learn the model
# Seeded (same seed as the train/test split): the ensemble draws random splits,
# so without a seed every run reports a different accuracy.
model = ExtraTreesClassifier(random_state=0).fit(X_train_processed, Y_train)

# Predict the score using the model
Y_test_predictions = model.predict(X_test_processed)
X_submission['Score'] = model.predict(X_submission_processed)

# Evaluate your model on the testing set
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_predictions))

# Plot a confusion matrix (each row normalised over the true label)
cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# SVC

In [None]:
from sklearn.svm import SVC

# This is where you can do more feature selection
# Identifier and raw-text columns cannot be used as numeric features.
non_feature_cols = ['Id', 'ProductId', 'UserId', 'Text', 'Summary']
# 'Unnamed: 0' appears to be the row index saved into train.csv (it mirrors Id
# in the preview above); left in, the model would train on row position.
index_artifact_cols = ['Unnamed: 0']

X_train_processed = (
    X_train.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_test_processed = (
    X_test.drop(columns=non_feature_cols)
    .drop(columns=index_artifact_cols, errors='ignore')
)
X_submission_processed = (
    X_submission.drop(columns=non_feature_cols + ['Score'])
    .drop(columns=index_artifact_cols, errors='ignore')
)

# Learn the model
model = SVC().fit(X_train_processed, Y_train)

# Predict the score using the model
# This is the last model cell, so these are the scores the submission file gets
# when the notebook is run top to bottom.
Y_test_predictions = model.predict(X_test_processed)
X_submission['Score'] = model.predict(X_submission_processed)

# Evaluate your model on the testing set
print("Accuracy on testing set = ", accuracy_score(Y_test, Y_test_predictions))

# Plot a confusion matrix (each row normalised over the true label)
cm = confusion_matrix(Y_test, Y_test_predictions, normalize='true')
sns.heatmap(cm, annot=True)
plt.title('Confusion matrix of the classifier')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Create the submission file: one row per Id with the Score predicted by the
# most recently run model cell.
submission = X_submission.loc[:, ['Id', 'Score']]
submission.to_csv("./data/submission.csv", index=False)