### **Mount Google Drive**

In [0]:
# Mount the user's Google Drive into the Colab filesystem so that the
# dataset files under '/content/drive/My Drive/...' can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Importing Model Building Libraries**

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split as tts
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

### **Reading TF-IDF Dataset of Train and Test**

In [0]:
# Every input file sits in the same Drive folder, so build each path from
# one shared prefix instead of repeating the full path four times.
DATA_DIR = '/content/drive/My Drive/Sentiment Analysis hackathon'

# Competition CSVs (`test` later supplies the `tweet_id` column for the
# submission file).
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# TF-IDF feature frames stored as pickles.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
train_tfidf = pd.read_pickle(f'{DATA_DIR}/train_tfidf.pkl')
test_tfidf = pd.read_pickle(f'{DATA_DIR}/test_tfidf.pkl')

In [0]:
# Show 4 randomly chosen rows of the training TF-IDF frame. No random_state
# is passed to sample(), so a different set of rows is shown on every run.
print('Train Dataset:')
train_tfidf.sample(4)

Train Dataset:


Unnamed: 0,aacaiuauaaouart,aapl,aaron,aarpbulletin,abacus,abandoned,abba,abc,aber,ability,able,abnormal,abound,aboutto,abroad,absolute,absolutely,absolutley,abt,abuzz,academy,acc,accelerater,acceleration,accelerator,accept,acceptable,access,accessibility,accessible,accessory,accesssxsw,accidentally,accommodate,accompanied,accomplish,according,accordion,account,accuracy,...,yrsday,yrt,yum,yummy,yup,yur,yyz,zaarly,zaarlyiscoming,zagg,zaggle,zap,zappos,zazzle,zazzlesxsw,zazzlsxsw,zbowling,zeiger,zeitgeist,zelda,zeldman,zen,zero,zeus,zimride,zing,zinio,zip,zite,zlf,zms,zomb,zombie,zomg,zone,zoom,zuckerberg,zynga,zzzs,senitment
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
6158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
4400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [0]:
# Show 4 randomly chosen rows of the test TF-IDF frame. No random_state
# is passed to sample(), so a different set of rows is shown on every run.
print('Test Dataset:')
test_tfidf.sample(4)

Test Dataset:


Unnamed: 0,aacaiuauaaouart,aapl,aaron,aarpbulletin,abacus,abandoned,abba,abc,aber,ability,able,abnormal,abound,aboutto,abroad,absolute,absolutely,absolutley,abt,abuzz,academy,acc,accelerater,acceleration,accelerator,accept,acceptable,access,accessibility,accessible,accessory,accesssxsw,accidentally,accommodate,accompanied,accomplish,according,accordion,account,accuracy,...,yowza,yrsday,yrt,yum,yummy,yup,yur,yyz,zaarly,zaarlyiscoming,zagg,zaggle,zap,zappos,zazzle,zazzlesxsw,zazzlsxsw,zbowling,zeiger,zeitgeist,zelda,zeldman,zen,zero,zeus,zimride,zing,zinio,zip,zite,zlf,zms,zomb,zombie,zomg,zone,zoom,zuckerberg,zynga,zzzs
1089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **Train-Validation split from Training Dataset**

In [0]:
# Separate the feature columns from the target. The label column is spelled
# 'senitment' in the loaded frame, so that exact spelling must be used here.
# `columns=` is used instead of the positional axis argument
# (`drop(labels, 1)`), which pandas 2.x no longer accepts.
X = train_tfidf.drop(columns=['senitment'])
y = train_tfidf['senitment']

In [0]:
# Hold out 25% of the training rows for validation, using a fixed seed.
# The label halves are named v_train / v_test (not y_train / y_test).
# run_model and run_cv_model below repeat this same split internally.
X_train, X_test, v_train, v_test = tts(X,y,test_size=0.25,random_state=1)

### **Model Preparation**

 ***Classifier without Hyper-Parameter tuning***

In [0]:
def run_model(X,y,models):
  """Fit each classifier class with default settings and print its scores.

  Parameters
  ----------
  X : feature matrix handed to the train/validation splitter.
  y : labels aligned with ``X``.
  models : iterable of estimator classes (not instances); each class is
      instantiated with no arguments.

  The data is split once (25% held out, ``random_state=1``) and every model
  is trained and evaluated on that same split. Nothing is returned: the
  classification report and the weighted F1-score are printed per model.
  """
  features_fit, features_val, labels_fit, labels_val = tts(
      X, y, test_size=0.25, random_state=1
  )
  divider = "**" * 20

  for estimator_cls in models:
    clf = estimator_cls()
    clf.fit(features_fit, labels_fit)
    predictions = clf.predict(features_val)

    print(f"Model Name: {estimator_cls.__name__}")
    print("Classification Report:")
    report_text = classification_report(labels_val, predictions)
    print(report_text)
    weighted_f1 = f1_score(labels_val, predictions, average='weighted')
    print(f"F1-Score: {weighted_f1}")
    print(divider)



In [0]:
# Baseline comparison: each classifier class is instantiated with its default
# hyper-parameters by run_model and scored on the same validation split.
run_model(X,y,[MultinomialNB, GaussianNB, RandomForestClassifier, LogisticRegression, SVC ])

Model Name: MultinomialNB
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       111
           1       0.66      0.96      0.78      1121
           2       0.70      0.23      0.35       557
           3       0.00      0.00      0.00        30

    accuracy                           0.67      1819
   macro avg       0.34      0.30      0.28      1819
weighted avg       0.62      0.67      0.59      1819

F1-Score: 0.5900326977476591
****************************************
Model Name: GaussianNB
Classification Report:
              precision    recall  f1-score   support

           0       0.14      0.36      0.20       111
           1       0.73      0.42      0.53      1121
           2       0.43      0.61      0.51       557
           3       0.02      0.07      0.03        30

    accuracy                           0.47      1819
   macro avg       0.33      0.36      0.32      1819
weighted avg       0

***Classifier with Hyper-Parameter tuning***

In [0]:
def run_cv_model(X,y,model,param,cv=10):
  """Tune one classifier with a cross-validated grid search and report it.

  Parameters
  ----------
  X : feature matrix.
  y : labels aligned with ``X``.
  model : estimator class (not an instance); instantiated with no arguments.
  param : dict passed to ``GridSearchCV`` as ``param_grid``.
  cv : cross-validation setting forwarded to ``GridSearchCV`` (default 10).

  Returns
  -------
  The fitted ``GridSearchCV`` object, so the caller can reuse it to predict.

  The grid search is fitted on the 75% training part of a fixed split
  (``random_state=1``); the remaining 25% is used only for the printed
  classification report and weighted F1-score.
  """
  X_train, X_test, v_train, v_test = tts(X,y,test_size=0.25,random_state=1)
  _model = model()
  print(f"Model Name: {model.__name__}")
  # Forward the caller's `cv` rather than a fixed fold count, so that the
  # parameter actually takes effect.
  _model = GridSearchCV(_model, param_grid=param, cv=cv)
  _model.fit(X_train,v_train)
  y_pred = _model.predict(X_test)
  print(f"Classification Report:")
  print(f"{classification_report(v_test, y_pred)}")
  print(f"F1-Score: {f1_score(v_test, y_pred, average='weighted')}")
  print(f'Best Estimator: {_model.best_estimator_}')
  print("**"*20)
  return _model


*Logistic Regression*

In [0]:
# Grid for LogisticRegression: six values of C crossed with both penalty types.
# NOTE(review): the 'l1' penalty is only supported by some solvers
# (e.g. 'liblinear', 'saga'). The recorded output shows solver='warn', i.e. an
# older scikit-learn; on releases whose default solver is 'lbfgs' the l1
# candidates cannot be fitted -- confirm against the installed version.
param = {
    'C': [0.001,0.01,0.1,1,10,100],
    'penalty': ['l1','l2']
}
# `model` holds the fitted GridSearchCV returned by run_cv_model.
model = run_cv_model(X,y,LogisticRegression,param,cv=10)

Model Name: LogisticRegression
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.19      0.28       111
           1       0.73      0.85      0.79      1121
           2       0.63      0.53      0.57       557
           3       0.00      0.00      0.00        30

    accuracy                           0.70      1819
   macro avg       0.47      0.39      0.41      1819
weighted avg       0.67      0.70      0.68      1819

F1-Score: 0.6770120196049574
Best Estimator: LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
****************************************


In [0]:
# Predict on the competition test features. `model` is whatever the most
# recent run_cv_model call returned (the LogisticRegression grid search when
# the notebook is run top to bottom).
final_predict=model.predict(test_tfidf)
# Pair each prediction with a tweet_id by position. zip() stops at the shorter
# input without raising -- assumes `test` and `test_tfidf` hold the same rows
# in the same order; TODO confirm.
submit_id=list(test["tweet_id"])
submit_pred=list(final_predict)
final_submission =pd.DataFrame(list(zip(submit_id, submit_pred)), columns =['tweet_id', 'sentiment'])
# Write the submission file without the DataFrame index column.
final_submission.to_csv("/content/drive/My Drive/Sentiment Analysis hackathon/submission_1.csv",index=False)

*RandomForest Classifier*

In [0]:
# Disabled experiment: grid search over RandomForestClassifier settings.
# Uncommenting it reassigns `param` and `model`, which would replace the
# LogisticRegression search result used by the submission cell.
# param = {
#     'n_estimators' : [100,200,250,300],
#     'max_depth' : [4,8,12,15],
#     # 'max_features' : np.arange(4,8,1),
#     'criterion': ['gini','entropy'],
#     # 'min_samples_split': np.arange(0.01,0.05,0.01)
# }
# model = run_cv_model(X,y,RandomForestClassifier,param,cv=10)

*Support Vector Machine*

In [0]:
# Disabled experiment: grid search over SVC settings (rbf kernel only).
# Uncommenting it reassigns `param` and `model`, which would replace the
# LogisticRegression search result used by the submission cell.
# param = {'C': [0.1, 100, 1000],  
#               'gamma': [1, 0.1, 0.0001], 
#               'kernel': ['rbf']}  
# model = run_cv_model(X,y,SVC,param,cv=10)