In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap

import nltk
import preprocessor as p
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn import metrics

from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import classification_report

In [15]:
from nltk.stem import PorterStemmer
porter_stemmer=PorterStemmer()

## Import Train/Test


In [2]:
train = pd.read_csv('./input/climate-change-edsa2020-21/train.csv')
test = pd.read_csv('./input/climate-change-edsa2020-21/test.csv')

## Pre-Processing


In [28]:
def preprocess_data(text):
    text=text.lower() 
    text=re.sub("\\W"," ",text) # remove special chars
    text=re.sub("\\s+(in|the|all|for|and|on)\\s+"," _connector_ ",text) # normalize certain words
    
    # stem words
    words=re.split("\\s+",text)
    stemmed_words=[porter_stemmer.stem(word=word) for word in words]
    return ' '.join(stemmed_words)
    
    
    
def cust_token(text):
    # create a space between special characters 
    text=re.sub("(\\W)"," \\1 ",text)

    # split based on whitespace
    return re.split("\\s+",text)

In [29]:
y = train['sentiment']
X = train['message']

## Vectorize Using simple Pre-Process and very simple tokenization



In [30]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),tokenizer=cust_token,analyzer='word',stop_words="english",max_df=0.85, preprocessor=preprocess_data)
X_vectorized = vectorizer.fit_transform(X)



In [31]:
X_vectorized

<15819x126612 sparse matrix of type '<class 'numpy.float64'>'
	with 464963 stored elements in Compressed Sparse Row format>

In [32]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.30,shuffle=True, random_state=25)

## Quik Submission on LinearSVC - Used GridSearchCV top optimize 2 params

In [45]:
lsvc = LinearSVC()
lsvc.fit(X_train, y_train)
lsvc_pred = lsvc.predict(X_val)

In [58]:
grid_param = {
    'n_estimators': [100, 300, 500, 800, 1000],
    'bootstrap': [True, False]
}
pipeline=Pipeline([ ('lin_svc',lsvc) ])
strarifiedCV = StratifiedShuffleSplit(n_splits=10,test_size=0.2, random_state=25)
grid_search = GridSearchCV(pipeline,param_grid=param_grid,verbose=3,scoring='accuracy',
                           cv =strarifiedCV,n_jobs=-1).fit(X_train, y_train)
print('-----grid search end------------')

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:    3.2s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    3.2s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.2s finished


-----grid search end------------


In [59]:
#f1_score(y_val, lsvc_pred, average="macro")

print ('on all train set')
scores = cross_val_score(grid_search.best_estimator_, X_train, y_train,cv=3,scoring='accuracy')
print (scores.mean(),scores)
print ('on test set')
scores = cross_val_score(grid_search.best_estimator_, X_val, y_val,cv=3,scoring='accuracy')
print (scores.mean(),scores)

on all train set
0.7361148740178813 [0.73503116 0.73259279 0.74072067]
on test set
0.6805731142014327 [0.68141593 0.68078382 0.6795196 ]


In [60]:
from sklearn import metrics

#print(metrics.classification_report(y_val, lsvc_pred))

## Transform Test Data with same vectorizer

In [40]:
testx = test['message']
test_vect = vectorizer.transform(testx)

## Predict Vectorized Test Data Using Grid_search(Best Result)

In [61]:
#y_pred = lsvc.predict(test_vect)
y_pred = grid_search.predict(test_vect)


In [62]:
test['sentiment'] = y_pred

In [43]:
test.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [63]:
test[['tweetid','sentiment']].to_csv('lsvc_gridcv.csv', index=False)

## A Try at multi Classifier prediction(Using Grid Search CV for the K-Folding, no Parameter Optimizing yet)

In [81]:
classifiers = [
    LogisticRegression(max_iter=15000), 
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),    
    AdaBoostClassifier()
]
names = ['Logistic Regression', 'Nearest Neighbors', 
         'Linear SVM', 'RBF SVM',          
         'Decision Tree', 'Random Forest',  'AdaBoost']

In [99]:
results = []

models = {}
confusion = {}
class_report = {}


for name, clf in zip(names, classifiers):    
    print ('Fitting {:s} model...'.format(name))
    #run_time = %timeit -q -o clf.fit(X_train, y_train)
    grid_param = {
    }
    
    pipeline=Pipeline([ (name,clf) ])
    strarifiedCV = StratifiedShuffleSplit(n_splits=10,test_size=0.2, random_state=25)
    grid_search = GridSearchCV(pipeline,param_grid=grid_param,verbose=3,scoring='accuracy',
                           cv =5,n_jobs=-1).fit(X_train, y_train)
    print ('... predicting')
    y_pred = grid_search.predict(X_train)   
    y_pred_test = grid_search.predict(X_val)
    
    print ('... scoring')
    accuracy  = metrics.accuracy_score(y_train, y_pred)
    precision = metrics.precision_score(y_train, y_pred,average='macro')
    recall    = metrics.recall_score(y_train, y_pred,average='macro')
    
    f1        = metrics.f1_score(y_train, y_pred,average='macro')    
    f1_test   = metrics.f1_score(y_val, y_pred_test,average='macro')    
    
    # Save the results to dictionaries
    models[name] = clf    
    confusion[name] = metrics.confusion_matrix(y_train, y_pred)
    class_report[name] = metrics.classification_report(y_train, y_pred)
    
    results.append([name, accuracy, precision, recall, f1, f1_test, run_time.best,grid_search,grid_search.best_score_,grid_search.best_params_])

    


Fitting Logistic Regression model...
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  25 | elapsed:   17.5s remaining:   45.1s
[Parallel(n_jobs=-1)]: Done  16 out of  25 | elapsed:   17.5s remaining:    9.8s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   17.5s remaining:    0.0s


ValueError: Invalid parameter n_estimators for estimator Pipeline(steps=[('Logistic Regression', LogisticRegression(max_iter=15000))]). Check the list of available parameters with `estimator.get_params().keys()`.

In [95]:

best_result = grid_search.best_score_
print(best_result)

0.6312415349887134


In [83]:
results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Train', 'F1 Test', 'Train Time','GS','GS_BEST_SCORE','GS_BEST_PARAM'])
results.set_index('Classifier', inplace= True)

In [84]:
results.sort_values('F1 Train', ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Train,F1 Test,Train Time,GS,GS_BEST_SCORE,GS_BEST_PARAM
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
RBF SVM,0.999187,0.999235,0.998359,0.998796,0.415228,161.674237,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,0.643476,{}
Logistic Regression,0.87158,0.934066,0.746435,0.805709,0.532846,161.674237,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,0.703025,{}
Nearest Neighbors,0.802944,0.748375,0.750942,0.747698,0.541122,161.674237,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,0.632641,{}
AdaBoost,0.643909,0.613392,0.497596,0.525441,0.49393,161.674237,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,0.631242,{}
Decision Tree,0.610494,0.710497,0.386227,0.404241,0.36911,161.674237,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,0.596208,{}
Linear SVM,0.538066,0.134516,0.25,0.174916,0.175731,161.674237,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,0.538149,{}
Random Forest,0.538066,0.134516,0.25,0.174916,0.175731,161.674237,GridSearchCV(cv=StratifiedShuffleSplit(n_split...,0.538149,{}


In [86]:
test_md = results.loc['RBF SVM']['GS']

In [87]:
test_md

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=10, random_state=25, test_size=0.2,
            train_size=None),
             estimator=Pipeline(steps=[('RBF SVM', SVC(C=1, gamma=2))]),
             n_jobs=-1, param_grid={}, scoring='accuracy', verbose=3)