In [56]:
path = "/content/drive/MyDrive/cs688project2/data/"

In [57]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt
import seaborn as sns
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_files


In [58]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#Import required packages form NLTK
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Instantiate wordnet lemmatizer object
lemmatizer = WordNetLemmatizer()

# function to clean text
def clean_text(text):
    # convert text to lowercase
    text = text.lower()

    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # tokenization
    words = word_tokenize(text)

    # join words to a string
    text = ' '.join(words)

    return text



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
df = pd.read_csv(path+'new_merged.csv')
df_train = df[:8500]
df_val = df[8500:9540]


In [60]:
tf_transfomer = TfidfVectorizer(stop_words='english',decode_error='ignore')

text_total = []
for i in df['text']:
  text_total.append(i)

X_total = tf_transfomer.fit_transform(text_total)
X_train = X_total[:8500]
X_test = X_total[8500:9540]
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train =scaler.transform(X_train)
X_test = scaler.transform(X_test)
y_train = df_train['label']
y_test = df_val['label']

print('Shape of training', X_train.shape, y_train.shape)
print('Shape of test', X_test.shape, y_test.shape)

Shape of training (8500, 9538) (8500,)
Shape of test (1040, 9538) (1040,)


In [62]:
#Logistic Regression - Find best set of parameters
#Use K-fold(10 split) cross validation to find best set of parameters
num_folds = 10
seed = 42
scoring = 'accuracy'

param_grid = {
    'C': [0.1,0.2,0.3,0.5,0.8,1,5,13,15],
    'solver': ['newton-cg','lbfgs','liblinear']
}

model= LogisticRegression(max_iter = 10000)

kfold = KFold(n_splits=num_folds,random_state=seed,shuffle = True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train,y=y_train)

print('Best Accuracy: %s Using %s' % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: 0.6328235294117647 Using {'C': 0.1, 'solver': 'lbfgs'}


In [65]:
#Fit and Predict Logistic Regression model with final parameters
model_log = LogisticRegression(C=1, solver='lbfgs')
model_log.fit(X=X_train,y=y_train)
y_pred = model_log.predict(X_test)
print('Accuracy: ',accuracy_score(y_test, y_pred))
print('AUC Score: ',roc_auc_score(y_test,y_pred))

Accuracy:  0.5278846153846154
AUC Score:  0.5018888086207934


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [66]:
#Naive Bayes
param_grid = {}
param_grid['alpha'] = [0.001,0.01,0.05,0.1,0.5,1,1.5,2]
model = MultinomialNB()
kfold = KFold(n_splits=num_folds,random_state=seed,shuffle = True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train,y=y_train)
print('Best : %s Using %s' % (grid_result.best_score_, grid_result.best_params_))


Best : 0.5497647058823529 Using {'alpha': 0.001}


In [67]:
#Fit and Predict Naive Bayes model with final parameters
model_nb = MultinomialNB(alpha=0.001)
model_nb.fit(X=X_train,y=y_train)
y_pred = model_nb.predict(X_test)
print('Accuracy: ',accuracy_score(y_test, y_pred))
print('AUC Score: ',roc_auc_score(y_test,y_pred))

Accuracy:  0.5413461538461538
AUC Score:  0.5301682619950636


In [68]:
param_grid = {}
param_grid['n_estimators'] = [10,50,100,150,200]
param_grid['max_features'] = ['sqrt','log2']
param_grid['max_depth'] = [3,4,5,6,7,8,9,10]
model = RandomForestClassifier()
kfold = KFold(n_splits=num_folds,random_state=seed,shuffle = True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train,y=y_train)
print('Best : %s Using %s' % (grid_result.best_score_, grid_result.best_params_))


Best : 0.6494117647058824 Using {'max_depth': 9, 'max_features': 'sqrt', 'n_estimators': 10}


In [69]:
#Fit and Predict Random Forest model with final parameters
model_rf = RandomForestClassifier(max_depth=9, max_features='sqrt', n_estimators=10)
model_rf.fit(X=X_train,y=y_train)
y_pred = model_rf.predict(X_test)
print('Accuracy: ',accuracy_score(y_test, y_pred))
print('AUC Score: ',roc_auc_score(y_test,y_pred))

Accuracy:  0.5682692307692307
AUC Score:  0.5040861477334296


In [70]:
#SVM
param_grid = {'C':[0.1,0.2,0.3,0.5,0.8,1,5,13,15,20]}
model = SVC(max_iter =100000)
kfold = KFold(n_splits=num_folds,random_state=seed,shuffle = True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train,y=y_train)
print('Best : %s Using %s' % (grid_result.best_score_, grid_result.best_params_))


Best : 0.6601176470588236 Using {'C': 0.8}


In [71]:
#Fit and Predict SVM model with final parameters
model_svc = SVC(C=0.5)
model_svc.fit(X=X_train,y=y_train)
y_pred = model_svc.predict(X_test)
print('Accuracy: ',accuracy_score(y_test, y_pred))
print('AUC Score: ',roc_auc_score(y_test,y_pred))

Accuracy:  0.5673076923076923
AUC Score:  0.502468243934742


In [73]:
#KNN
param_grid = {}
param_grid['n_neighbors'] = [3,5,10,20,30]
param_grid['weights'] = ['uniform','distance']
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds,random_state=seed,shuffle = True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=X_train,y=y_train)
print('Best : %s Using %s' % (grid_result.best_score_, grid_result.best_params_))

Best : 0.6494117647058824 Using {'n_neighbors': 5, 'weights': 'uniform'}


In [76]:
model_knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')
model_knn.fit(X=X_train,y=y_train)
y_pred = model_knn.predict(X_test)
print('Accuracy: ',accuracy_score(y_test, y_pred))
print('AUC Score: ',roc_auc_score(y_test,y_pred))

Accuracy:  0.575
AUC Score:  0.5169466016495093
