In [22]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.svm import SVC
import os
os.chdir('/content/drive/My Drive/Projects/Sentiment Analysis')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
sample = pd.read_csv('/content/drive/My Drive/Projects/Sentiment Analysis/sample_submission_LnhVWA4.csv')
test = pd.read_csv('/content/drive/My Drive/Projects/Sentiment Analysis/test_oJQbWVk.csv')
train = pd.read_csv('/content/drive/My Drive/Projects/Sentiment Analysis/train_2kmZucJ.csv')

In [3]:
def summary(data):
  print('Shape Of Data :{}'.format(data.shape))
  print(pd.DataFrame({'Dtype':data.dtypes,
                       'NAs':data.isnull().sum(),
                       'Uniques':data.nunique()}),'\n')
def main():
  print('Train Summary:')
  summary(train)
  print('Test Summary:')
  summary(test)
  print('Sample Summary:')
  summary(sample)

main()  

Train Summary:
Shape Of Data :(7920, 3)
        Dtype  NAs  Uniques
id      int64    0     7920
label   int64    0        2
tweet  object    0     7918 

Test Summary:
Shape Of Data :(1953, 2)
        Dtype  NAs  Uniques
id      int64    0     1953
tweet  object    0     1953 

Sample Summary:
Shape Of Data :(1953, 2)
       Dtype  NAs  Uniques
id     int64    0     1953
label  int64    0        1 



In [4]:
train.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [0]:
def PreProcessData(data):
  ResultDF=data.copy()
  Lemmatizer = WordNetLemmatizer()
  StopWords =stopwords.words('english')
  ResultDF['tweet'] = ResultDF['tweet'].apply(lambda x:'  '.join(Lemmatizer.lemmatize(i) for i in re.sub('[^a-zA-Z]',' ',x).split() if (i not in StopWords)&(len(i)>3)))
  ResultDF.drop('id',axis=1,inplace=True)
  return ResultDF

In [0]:
Preptrain = PreProcessData(train)
Preptest = PreProcessData(test)

In [7]:
Preptrain.head()

Unnamed: 0,label,tweet
0,0,fingerprint Pregnancy Test http MfQV andr...
1,0,Finally transparant silicon case Thanks u...
2,0,love Would talk makememories unplug relax...
3,0,wired know George made iphone cute daven...
4,1,What amazing service Apple even talk que...


In [0]:
xtrain,xval,ytrain,yval = train_test_split(Preptrain.drop('label',axis=1),Preptrain['label'])

In [0]:
tfidfVec = TfidfVectorizer( max_df=0.8, max_features=10000,ngram_range=(1,2)) 
xtrain_tfidf = tfidfVec.fit_transform(xtrain.values.flatten())
xval_tfidf =tfidfVec.transform(xval.values.flatten())
test_tfidf = tfidfVec.transform(Preptest.values.flatten())


In [0]:
def EvaluateModel(Model):
  val_pred = Model.predict(xval_tfidf)
  train_pred = Model.predict(xtrain_tfidf)
  TrainAcc = accuracy_score(ytrain,train_pred)
  ValAcc = accuracy_score(yval,val_pred)
  print('Train Accuracy :{:.2f} , Valid Accuracy :{:.2f}'.format(TrainAcc,ValAcc))
  FScore = f1_score(yval, val_pred, average="weighted")
  Precision=precision_score(yval, val_pred, average="weighted")
  Recall = recall_score(yval, val_pred, average="weighted")
  print('Weighted Average Score:')
  print('Precision :{:.2f} , Recall :{:.2f} , F1Score :{:.2f} '.format(Precision, Recall, FScore))
  return

In [86]:
lr = LogisticRegression(max_iter=3000 )
params = {'C':[0.01,1,10],
       'penalty':['l1','l2'] , 
       'class_weight':[None,'balanced'],
       }
GridCV_lr = GridSearchCV(lr , param_grid = params , scoring='f1_weighted' , n_jobs=-1)
GridCV_lr.fit(xtrain_tfidf, ytrain)
print(GridCV_lr.best_params_)
EvaluateModel(GridCV_lr)

{'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Train Accuracy :0.92 , Valid Accuracy :0.89
Weighted Average Score:
Precision :0.91 , Recall :0.89 , F1Score :0.89 


In [87]:
rf = RandomForestClassifier(class_weight='balanced' )
params = {'max_depth':[5,10] }
GridCV_rf = GridSearchCV(rf , param_grid = params , scoring='f1_weighted' , n_jobs=-1)
GridCV_rf.fit(xtrain_tfidf, ytrain)
print(GridCV_rf.best_params_,'\n')
EvaluateModel(GridCV_rf)

{'max_depth': 10} 

Train Accuracy :0.85 , Valid Accuracy :0.84
Weighted Average Score:
Precision :0.88 , Recall :0.84 , F1Score :0.85 


In [93]:
xgb = XGBClassifier( )
params = { 'max_depth':[-1,5,10],
          'learning_rate':[0.01,0.1,1,0.001]}
GridCV_xgb = GridSearchCV(xgb , param_grid = params , scoring='f1_weighted' , n_jobs=-1)
GridCV_xgb.fit(xtrain_tfidf, ytrain)
EvaluateModel(GridCV_xgb)

Train Accuracy :0.90 , Valid Accuracy :0.88
Weighted Average Score:
Precision :0.89 , Recall :0.88 , F1Score :0.88 


In [11]:
lg =LGBMClassifier( )
params = { 'max_depth':[-1,5,10],
          'learning_rate':[0.01,0.1,1,0.001]}
GridCV_lgb = GridSearchCV(lg , param_grid = params , scoring='f1_weighted' , n_jobs=-1)
GridCV_lgb.fit(xtrain_tfidf, ytrain)
EvaluateModel(GridCV_lgb)

Train Accuracy :0.92 , Valid Accuracy :0.89
Weighted Average Score:
Precision :0.89 , Recall :0.89 , F1Score :0.89 


In [24]:
svc = SVC(max_iter=3000, class_weight = 'balanced' )
params = {'C':[0.001,0.01,1] 
       }
GridCV_svc = GridSearchCV(svc , param_grid = params , scoring='f1_weighted' , n_jobs=-1)
GridCV_svc.fit(xtrain_tfidf, ytrain)
print(GridCV_svc.best_params_,'\n')
EvaluateModel(GridCV_svc)



{'C': 1} 

Train Accuracy :0.97 , Valid Accuracy :0.90
Weighted Average Score:
Precision :0.90 , Recall :0.90 , F1Score :0.90 


In [25]:
test_preds = GridCV_svc.predict(test_tfidf)
test_preds

array([1, 0, 1, ..., 1, 1, 0])

In [0]:
sample['label'] = test_preds
sample.to_csv('Mysub1.csv',index=False)