# References 
## Source : 
        [Twitter Sentiment Analysis](https://datahack.analyticsvidhya.com/contest/practice-problem-twitter-sentiment-analysis/)
## Resources :
1. https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/
2. https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_dataframes/
3. https://www.analyticsvidhya.com/blog/2014/11/text-data-cleaning-steps-python/
4. https://machinelearningmastery.com/clean-text-machine-learning-python/
5. https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a
6. http://scikit-learn.org/stable/modules/cross_validation.html
7. http://scikit-learn.org/stable/modules/pipeline.html
8. http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html

In [198]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes
from sklearn.naive_bayes import BernoulliNB  # Bernoulli Naive Bayes
from sklearn.metrics import precision_recall_fscore_support as pr
from sklearn.linear_model import SGDClassifier  # Lineaer Classifier
from sklearn.model_selection import GridSearchCV  # GridSearchCV
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.tree import DecisionTreeClassifier # Decision Tree 
from sklearn.svm import LinearSVC # Support Vector Classification
from sklearn.neural_network import MLPClassifier # Multi Layer Neural Network
from sklearn.model_selection import cross_val_score

In [295]:
# to update my nltk data folder
# nltk.download()

pd.set_option('max_colwidth', 280)
dataPath = "data\\"

# Preprocessing Steps : Data Cleaning, Sampling

In [296]:
trainingDataPath = dataPath + "train.csv"

In [297]:
data = pd.read_csv(trainingDataPath,encoding='utf-8')

In [298]:
apostrophe = { 
    "'m":"am","'d":"had","'s":"is","'ve":"have","'ll":"will","'re":"are","'t":"not"
}
# special cases are can't won't ain't
# made using list under https://www.panopy.com/iphone/secret-ada/cracking-a-cipher.html

# Stop words in english Language
stop_words = stopwords.words('english')
# print(stop_words)
# Stemming Engine
stemmer = SnowballStemmer("english")

In [301]:
def apostropheCleaner(x):
    x = x.lower()
    # Step 1 : removing apostrophe
    words = x.split()
    reformed = []
    flag = False
    for word in words:
        flag = False
        for k in apostrophe:
            if k in word:
                flag = True
                if k == "'t":
                    # for type : can't,won't,ain't
                    if word == "can't":
                        reformed.append("can not")
                    elif word == "won't":
                        reformed.append("will not")
                    elif word == "ain't":
                        reformed.append("is not")
                    else:
                        reformed.append(word.split("'")[0][:-1]+" "+apostrophe[k])
                else:
                    reformed.append(word.split("'")[0]+" "+apostrophe[k])
                break
        if not flag:
            reformed.append(word)
    return " ".join(reformed)

def linkCleaner(x):
    # Step 2: URL removal
    # removing url,usernames and hashtag
    return re.sub(r"http\S+|@\S+", "", x)
    
def hashtagExtract(x):    
    return " ".join(set(re.findall(r"#(\w+)", x)))
    
def wordCleaner(x):
    # remove the hadhtags that remained with the sentence
    # x = re.sub(r"#\S+", "", x)
    
    # # Step 3: Removing Punctuation
    # tokens = word_tokenize(reform)
    # words = [w for w in tokens if w.isalpha()]
    
    # # Step 4: Stemming Words
    # stemmed = [stemmer.stem(word) for word in words]
    
    # # Step 5: Removing Stop Words
    # words = [w for w in stemmed if not w in stop_words]
    
    # Combining Step 3,4,5
    tokens = word_tokenize(x)
    words = []
    for w in tokens:
        if w.isalpha() and w not in stop_words:
            # words.append(stemmer.stem(w))   //// ignoring stemming for now
            words.append(w)
    x = " ".join(words)
    return x

In [302]:
data['cleanedTweet'] = data['tweet'].transform(apostropheCleaner)

data['cleanedTweet'] = data['cleanedTweet'].transform(linkCleaner)

data['hastag'] = data['cleanedTweet'].transform(hashtagExtract)

data['cleanedTweet'] = data['cleanedTweet'].transform(wordCleaner)

# data[data['label'] == 1]

features = data['cleanedTweet']
label = data['label']

In [307]:
print(features.shape)
# print(features)

# print(label.shape)

(31962,)
0                                                              father dysfunctional selfish drags kids dysfunction run
1                                        thanks lyft credit use cause offer wheelchair vans pdx disapointed getthanked
2                                                                                                       bihday majesty
3                                                                                             model love u take u time
4                                                                                        factsguide society motivation
5                                                huge fan fare big talking leave chaos pay disputes get allshowandnogo
6                                                                                                     camping tomorrow
7                               next school year year think school exams hate imagine actorslife revolutionschool girl
8                                      

In [304]:
# Splitting the data using Sklearn Model Selection, this should give us better results for cross valication
X_train,X_test,Y_train,Y_test = train_test_split(features, label, test_size=0.2)

In [305]:
print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)

(25569,) (25569,) (6393,) (6393,)


## Remember 
    When data set is very skewed like in current scenario where there are very low number of `class 1` and very large number of `class 0`, 
    rather than calculating accuracy, we should be measuring the model using f1 Score, which is calculated using Precision and Recall
### Precision :
    * High Precision means low false positive
    * It means low wrong prediction for class 0
### Recall :
    * High Recall means low false negative
    * It means low wrong prediction for class 1

# Naive Bayes

In [306]:
# Naive Bayes
text_clf_nb = Pipeline([
                        ('vect', CountVectorizer(ngram_range=(1, 2))),
                        ('tfidf', TfidfTransformer(use_idf=True)),
                        ('clf_nb', MultinomialNB(alpha = 0.01))
                       ])
print(cross_val_score(text_clf_nb,X_train,Y_train,cv=5,scoring='f1'))
text_clf_nb = text_clf_nb.fit(X_train,Y_train)



[ 0.7073955   0.67779633  0.66777963  0.67343486  0.69666667]


In [253]:
predicted_nb = text_clf_nb.predict(X_test)
print("Classified as Class 1 :", sum(predicted_nb))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_nb, average='binary', pos_label=1,beta=1.0)
print("Values for Naive Bayes : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 308
Total Actual Class 1 : 478
Values for Naive Bayes :  Precision -> 0.863636363636  Recall-> 0.556485355649  F1score-> 0.676844783715


# Linear Classifier

In [227]:
# Linear CLassifer
text_clf_linearClassifier = Pipeline([
                                        ('vect', CountVectorizer()), 
                                        ('tfidf', TfidfTransformer()), 
                                        ('clf-lc', SGDClassifier(loss='perceptron', penalty='elasticnet', alpha=0.0001, max_iter=1000, tol=0.001))
                                    ])
text_clf_linearClassifier = text_clf_linearClassifier.fit(X_train,Y_train)

In [228]:
predicted_linearClassifier = text_clf_linearClassifier.predict(X_test)
print("Classified as Class 1 :", sum(predicted_linearClassifier))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_linearClassifier, average='binary', pos_label=1,beta=1.0)
print("Values for Linear Classifier : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 420
Total Actual Class 1 : 478
Values for Linear Classifier :  Precision -> 0.602380952381  Recall-> 0.529288702929  F1score-> 0.563474387528


# Random Forset

In [189]:
# Random Forest
clf_rf = Pipeline([
                        ('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf_rf', RandomForestClassifier(n_estimators=100, max_features=2))
                 ])
clf_rf = clf_rf.fit(X_train,Y_train)

In [190]:
predicted_rf = clf_rf.predict(X_test)
print("Classified as Class 1 :", sum(predicted_rf))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_rf, average='binary', pos_label=1,beta=1.0)
print("Values for Random Forest : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 167
Total Actual Class 1 : 478
Values for Random Forest :  Precision -> 0.964071856287  Recall-> 0.336820083682  F1score-> 0.499224806202


# DecisionTree Classifier

In [200]:
# DecisionTreeClassifier
clf_dt = Pipeline([
                        ('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf_dt', DecisionTreeClassifier())
                 ])
print(cross_val_score(clf_dt,X_train,Y_train,cv=5,scoring='f1'))
clf_dt = clf_dt.fit(X_train,Y_train)

array([ 0.56929955,  0.64619883,  0.59167951,  0.59411765,  0.6185567 ])

In [204]:
predicted_dt = clf_dt.predict(X_test)
print("Classified as Class 1 :", sum(predicted_dt))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_rf, average='binary', pos_label=1,beta=1.0)
print("Values for Random Forest : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 415
Total Actual Class 1 : 478
Values for Random Forest :  Precision -> 0.964071856287  Recall-> 0.336820083682  F1score-> 0.499224806202


# Bernoulli Naive Bayes

In [285]:
# Bernoulli Naive Bayes
clf_bnb = Pipeline([
                        ('vect', CountVectorizer(ngram_range=(1,2))),
                        ('tfidf', TfidfTransformer(use_idf=True)),
                        ('clf_bnb', BernoulliNB(alpha = 0.0117))
                 ])
print(cross_val_score(clf_bnb,X_train,Y_train,cv=5,scoring='f1'))
clf_bnb = clf_bnb.fit(X_train,Y_train)

[ 0.7217806   0.72281776  0.7073955   0.68454259  0.70684039]


In [286]:
predicted_bnb = clf_bnb.predict(X_test)
print("Classified as Class 1 :", sum(predicted_bnb))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_bnb, average='binary', pos_label=1,beta=1.0)
print("Values for Bernoulli Naive Bayes : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 372
Total Actual Class 1 : 478
Values for Bernoulli Naive Bayes :  Precision -> 0.790322580645  Recall-> 0.615062761506  F1score-> 0.691764705882


# Support Vector Classification

In [180]:
# Support Vector Classification
clf_svc = Pipeline([
                        ('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf_svc', LinearSVC())
                 ])
clf_svc = clf_svc.fit(X_train,Y_train)

In [181]:
predicted_svc = clf_svc.predict(X_test)
print("Classified as Class 1 :", sum(predicted_svc))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_svc, average='binary', pos_label=1,beta=1.0)
print("Values for Bernoulli Naive Bayes : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 305
Total Actual Class 1 : 478
Values for Bernoulli Naive Bayes :  Precision -> 0.865573770492  Recall-> 0.55230125523  F1score-> 0.674329501916


# Multi-layer Perceptron classifier (Neural Network)

In [194]:
# Support Vector Classification
clf_nn = Pipeline([
                        ('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf_nn', MLPClassifier(hidden_layer_sizes=20))
                 ])
clf_nn = clf_nn.fit(X_train,Y_train)

In [195]:
predicted_nn = clf_nn.predict(X_test)
print("Classified as Class 1 :", sum(predicted_nn))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_nn, average='binary', pos_label=1,beta=1.0)
print("Values for Bernoulli Naive Bayes : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 419
Total Actual Class 1 : 478
Values for Bernoulli Naive Bayes :  Precision -> 0.692124105012  Recall-> 0.606694560669  F1score-> 0.646599777035


# GridSearch 

It is used for finding the best fit of parameter for the defined estimator (predictor/model/classifier) from a range defined under param_grid for the corresponding estimator.

The Following are the parameters defined for GridSearch :
*   Defined to run on multi core
*   Uses f1 scoring
*   4-fold cross validation
*   It is resistent to model failures (if it does) for any combination of parameters within the range it is searching for

## GridSearch For Naive Bayes 

In [41]:
# Listing the Parameters for setting up parameters for `param_grid`
# text_clf_nb.get_params()

In [248]:
param_grid = {
    'vect__ngram_range': [(1, 3), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf_nb__alpha': (1e-2, 1e-3, 0.015)
}
gs_clf_nb = GridSearchCV(text_clf_nb, param_grid, n_jobs=-1, cv=4, scoring='f1', error_score=np.NaN, refit=True)
gs_clf_nb = gs_clf_nb.fit(X_train,Y_train)

In [249]:
# print(gs_clf.best_score_,gs_clf.best_params_,gs_clf.best_estimator_)
print(gs_clf_nb.best_score_,gs_clf_nb.best_params_)

0.687984527233 {'clf_nb__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [250]:
# As refit is True, the estimators are already trained with the best possible parameters within the defined range 
# and we can use predict fuction directly 
predicted_gs_clf_nb = gs_clf_nb.predict(X_test)
print("Classified as Class 1 :", sum(predicted_gs_clf_nb))
print("Total Actual Class 1 :",sum(Y_test))

Classified as Class 1 : 308
Total Actual Class 1 : 478


In [251]:
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_gs_clf_nb, average='binary', pos_label=1,beta=1.0)
print("Values for Best Parameters from Grid Search Naive Bayes : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Values for Best Parameters from Grid Search Naive Bayes :  Precision -> 0.863636363636  Recall-> 0.556485355649  F1score-> 0.676844783715


## GridSearch For Bernoulli Naive Bayes 

In [115]:
# Listing the Parameters for setting up parameters for `param_grid`
# clf_bnb.get_params()

In [281]:
param_grid = {
    'vect__ngram_range': [(1, 2),],
    'tfidf__use_idf': (True,),
    'clf_bnb__alpha': (0.0117, 0.01175, 0.0118)
}
gs_clf_bnb = GridSearchCV(clf_bnb, param_grid, n_jobs=-1, cv=4, scoring='f1', error_score=np.NaN, refit=True)
gs_clf_bnb = gs_clf_bnb.fit(X_train,Y_train)

In [282]:
# print(gs_clf.best_score_,gs_clf.best_params_,gs_clf.best_estimator_)
print(gs_clf_bnb.best_score_,gs_clf_bnb.best_params_)

0.710920222142 {'clf_bnb__alpha': 0.0117, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}


In [283]:
# As refit is True, the estimators are already trained with the best possible parameters within the defined range 
# and we can use predict fuction directly 
predicted_gs_clf_bnb = gs_clf_bnb.predict(X_test)
print("Classified as Class 1 :", sum(predicted_gs_clf_bnb))
print("Total Actual Class 1 :",sum(Y_test))

Classified as Class 1 : 372
Total Actual Class 1 : 478


In [284]:
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_gs_clf_bnb, average='binary', pos_label=1,beta=1.0)
print("Values for Best Parameters from Grid Search Naive Bayes : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Values for Best Parameters from Grid Search Naive Bayes :  Precision -> 0.790322580645  Recall-> 0.615062761506  F1score-> 0.691764705882


## GridSearch For Linear Classifier

In [60]:
# List of parameters for Linear Classifer 
# text_clf_linearClassifier.get_params()


In [242]:
param_grid = {
    'vect__ngram_range': [(1, 2),(1,3)],
    'tfidf__use_idf': (True,),
    'clf-lc__alpha': (0.015,0.02),
    'clf-lc__loss': ('perceptron',),
    'clf-lc__max_iter': (1000,),
    'clf-lc__penalty': ('l2',),
    'clf-lc__tol': (0.001,),
}
# using just one core as CPU percent usage was very high 
gs_linearClassifier = GridSearchCV(text_clf_linearClassifier, param_grid, n_jobs=1, cv=4, scoring='f1', error_score=np.NaN, refit=True)
gs_linearClassifier = gs_linearClassifier.fit(X_train,Y_train)

In [243]:
# print(gs_clf.best_score_,gs_clf.best_params_,gs_clf.best_estimator_)
print(gs_linearClassifier.best_score_,gs_linearClassifier.best_params_)

0.644580739072 {'clf-lc__alpha': 0.02, 'clf-lc__loss': 'perceptron', 'clf-lc__max_iter': 1000, 'clf-lc__penalty': 'l2', 'clf-lc__tol': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 3)}


In [244]:
# As refit is True, the estimators are already trained with the best possible parameters within the defined range 
# and we can use predict fuction directly 
predicted_gs_linearClassifier = gs_linearClassifier.predict(X_test)
print("Classified as Class 1 :", sum(predicted_gs_linearClassifier))
print("Total Actual Class 1 :",sum(Y_test))

Classified as Class 1 : 535
Total Actual Class 1 : 478


In [245]:
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_gs_linearClassifier, average='binary', pos_label=1,beta=1.0)
print("Values for Best Parameters from Grid Search Linear Classifier : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Values for Best Parameters from Grid Search Linear Classifier :  Precision -> 0.629906542056  Recall-> 0.705020920502  F1score-> 0.665350444225


In [287]:
# Submission Test Tweets Prep 
submissionDataPath = dataPath + "test.csv"
submissionData = pd.read_csv(submissionDataPath,encoding='utf-8')


submissionData['cleanedTweet'] = submissionData['tweet'].transform(apostropheCleaner)
submissionData['cleanedTweet'] = submissionData['cleanedTweet'].transform(linkCleaner)
submissionData['hastag'] = submissionData['cleanedTweet'].transform(hashtagExtract)
submissionData['cleanedTweet'] = submissionData['cleanedTweet'].transform(wordCleaner)

# retraining on complete original data with best parameters
clf_bnb = clf_bnb.fit(features,label)

# predicting the class for the test data
submissionData['label'] =  clf_bnb.predict(submissionData['cleanedTweet'])
submission = submissionData[["id","label"]].set_index("id")

# submissionData.describe()
submission.to_csv(dataPath+"submission.csv")