# References 
## Source : 
        [Twitter Sentiment Analysis](https://datahack.analyticsvidhya.com/contest/practice-problem-twitter-sentiment-analysis/)
## Resources :
1. https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/
2. https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_dataframes/
3. https://www.analyticsvidhya.com/blog/2014/11/text-data-cleaning-steps-python/
4. https://machinelearningmastery.com/clean-text-machine-learning-python/
5. https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a
6. http://scikit-learn.org/stable/modules/cross_validation.html
7. http://scikit-learn.org/stable/modules/pipeline.html
8. http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes
from sklearn.metrics import precision_recall_fscore_support as pr
from sklearn.linear_model import SGDClassifier  # Lineaer Classifier
from sklearn.model_selection import GridSearchCV  # GridSearchCV


In [2]:
# to update my nltk data folder
# nltk.download()

pd.set_option('max_colwidth', 280)
dataPath = "E:\\GitHub\\Personal-Development\\Anaconda-Projects\\AnalyticsVidhya\\Twitter-Sentiment-Analysis\\data\\"

# Preprocessing Steps : Data Cleaning, Sampling

In [3]:
trainingDataPath = dataPath + "train.csv"

In [4]:
data = pd.read_csv(trainingDataPath,encoding='utf-8')

In [5]:
apostrophe = { 
    "'m":"am","'d":"had","'s":"is","'ve":"have","'ll":"will","'re":"are","'t":"not"
}
# special cases are can't won't ain't
# made using list under https://www.panopy.com/iphone/secret-ada/cracking-a-cipher.html

# Stop words in english Language
stop_words = stopwords.words('english')
# print(stop_words)
# Stemming Engine
stemmer = SnowballStemmer("english")

In [6]:
def apostropheCleaner(x):
    x = x.lower()
    # Step 1 : removing apostrophe
    words = x.split()
    reformed = []
    flag = False
    for word in words:
        flag = False
        for k in apostrophe:
            if k in word:
                flag = True
                if k == "'t":
                    # for type : can't,won't,ain't
                    if word == "can't":
                        reformed.append("can not")
                    elif word == "won't":
                        reformed.append("will not")
                    elif word == "ain't":
                        reformed.append("is not")
                    else:
                        reformed.append(word.split("'")[0][:-1]+" "+apostrophe[k])
                else:
                    reformed.append(word.split("'")[0]+" "+apostrophe[k])
                break
        if not flag:
            reformed.append(word)
    return " ".join(reformed)

def linkCleaner(x):
    # Step 2: URL removal
    # removing url,usernames and hashtag
    return re.sub(r"http\S+|@\S+", "", x)
    
def hashtagExtract(x):    
    return " ".join(set(re.findall(r"#(\w+)", x)))
    
def wordCleaner(x):
    # remove the hadhtags that remained with the sentence
    # x = re.sub(r"#\S+", "", x)
    
    # # Step 3: Removing Punctuation
    # tokens = word_tokenize(reform)
    # words = [w for w in tokens if w.isalpha()]
    
    # # Step 4: Stemming Words
    # stemmed = [stemmer.stem(word) for word in words]
    
    # # Step 5: Removing Stop Words
    # words = [w for w in stemmed if not w in stop_words]
    
    # Combining Step 3,4,5
    tokens = word_tokenize(x)
    words = []
    for w in tokens:
        if w.isalpha() and w not in stop_words:
            # words.append(stemmer.stem(w))   //// ignoring stemming for now
            words.append(w)
    x = " ".join(words)
    return x

In [7]:
data['cleanedTweet'] = data['tweet'].transform(apostropheCleaner)

data['cleanedTweet'] = data['cleanedTweet'].transform(linkCleaner)

data['hastag'] = data['cleanedTweet'].transform(hashtagExtract)

data['cleanedTweet'] = data['cleanedTweet'].transform(wordCleaner)

# data[data['label'] == 1]

features = data['cleanedTweet']
label = data['label']

print(features.shape)
print(label.shape)

(31962,)
(31962,)


In [27]:
# Splitting the data using Sklearn Model Selection, this should give us better results for cross valication
X_train,X_test,Y_train,Y_test = train_test_split(features, label, test_size=0.2)

In [28]:
print(X_train.shape,Y_train.shape,X_test.shape,Y_test.shape)

(25569,) (25569,) (6393,) (6393,)


## Remember 
    When data set is very skewed like in current scenario where there are very low number of `class 1` and very large number of `class 0`, 
    rather than calculating accuracy, we should be measuring the model using f1 Score, which is calculated using Precision and Recall
### Precision :
    * High Precision means low false positive
    * It means low wrong prediction for class 0
### Recall :
    * High Recall means low false negative
    * It means low wrong prediction for class 1

# Naive Bayes

In [29]:
# Naive Bayes
text_clf_nb = Pipeline([
                        ('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf_nb', MultinomialNB(alpha = 0.1))
                       ])
text_clf_nb = text_clf_nb.fit(X_train,Y_train)

In [30]:
predicted_nb = text_clf_nb.predict(X_test)
print("Classified as Class 1 :", sum(predicted_nb))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_nb, average='binary', pos_label=1,beta=1.0)
print("Values for Naive Bayes : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 267
Total Actual Class 1 : 463
Values for Naive Bayes :  Precision -> 0.895131086142  Recall-> 0.516198704104  F1score-> 0.654794520548


# Linear Classifier
This Model can work as SVM too, with hinge loss

In [31]:
# Linear CLassifer
text_clf_linearClassifier = Pipeline([
                                        ('vect', CountVectorizer()), 
                                        ('tfidf', TfidfTransformer()), 
                                        ('clf-svm', SGDClassifier(loss='perceptron', penalty='elasticnet', alpha=0.001, max_iter=1000, tol=0.001))
                                    ])
text_clf_linearClassifier = text_clf_linearClassifier.fit(X_train,Y_train)

In [32]:
predicted_linearClassifier = text_clf_linearClassifier.predict(X_test)
print("Classified as Class 1 :", sum(predicted_linearClassifier))
print("Total Actual Class 1 :",sum(Y_test))
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_linearClassifier, average='binary', pos_label=1,beta=1.0)
print("Values for Linear Classifier : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)

Classified as Class 1 : 129
Total Actual Class 1 : 463
Values for Linear Classifier :  Precision -> 0.744186046512  Recall-> 0.207343412527  F1score-> 0.324324324324


# GridSearch 
It is used for finding the best fit of parameter for the defined estimator(predictor/model/classifier) from a range defined under `param_grid`

#### GridSearch For Naive Bayes

*   Defined to run on multi core
*   Uses f1 scoring
*   4-fold cross validation
*   It is resistent to model failures (if it does) for any combination of parameters within the range it is searching for 

In [33]:
# Listing the Parameters for setting up parameters for `param_grid`
# text_clf_nb.get_params()

In [34]:
# GridSearchCV
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf_nb__alpha': (1e-2, 1e-3)
}
gs_clf = GridSearchCV(text_clf_nb, param_grid, n_jobs=-1, cv=4, scoring='f1', error_score=np.NaN, refit=True)
gs_clf = gs_clf.fit(X_train,Y_train)

In [35]:
# print(gs_clf.best_score_,gs_clf.best_params_,gs_clf.best_estimator_)
print(gs_clf.best_score_,gs_clf.best_params_)

0.665723286674 {'clf_nb__alpha': 0.01, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}


In [36]:
# As refit is True, the estimators are already trained with the best possible parameters within the defined range 
# and we can use predict fuction directly 
predicted_gs_clf = gs_clf.predict(X_test)
print("Classified as Class 1 :", sum(predicted_gs_clf))
print("Total Actual Class 1 :",sum(Y_test))

Classified as Class 1 : 315
Total Actual Class 1 : 463


In [37]:
# F1 Score, Precision, Recall
Precision, Recall, F1score, Support = pr(Y_test, predicted_gs_clf, average='binary', pos_label=1,beta=1.0)
print("Values for Linear Classifier : ","Precision ->",Precision," Recall->",Recall," F1score->",F1score)
print("high precision means low false positive(means low wrong prediction for class 0), high recall means low false negative(means low wrong prediction for class 1)")

Values for Linear Classifier :  Precision -> 0.898412698413  Recall-> 0.611231101512  F1score-> 0.727506426735
high precision means low false positive(means low wrong prediction for class 0), high recall means low false negative(means low wrong prediction for class 1)


In [None]:
# text_clf_linearClassifier.get_params()


In [None]:
# # Submission Test Tweets Prep 
# submissionDataPath = dataPath + "test.csv"
# submissionData = pd.read_csv(submissionDataPath,encoding='utf-8')


# submissionData['cleanedTweet'] = submissionData['tweet'].transform(apostropheCleaner)
# submissionData['cleanedTweet'] = submissionData['cleanedTweet'].transform(linkCleaner)
# submissionData['hastag'] = submissionData['cleanedTweet'].transform(hashtagExtract)
# submissionData['cleanedTweet'] = submissionData['cleanedTweet'].transform(wordCleaner)


# submissionData['label'] =  gs_clf.predict(submissionData['cleanedTweet'])
# submission = submissionData[["id","label"]].set_index("id")

# # submissionData.describe()
# submission.to_csv(dataPath+"submission.csv")