##### Authors: 
- Vikram Hanumanthrao Patil
- Prashantkumar Kulkarni

##### Date: 2/6/2019

##### Version: 3.0

##### Environment: Python 3.6.1 and Jupyter notebook

# Table of contents
### 1. [Importing libraries](#library)
### 2. [Initialization](#initialisation)
### 3. [Read training and label](#read_train)
### 4. [Data pre-processing](#preprocess)   
### 5. [Feature generation](#feature)
- #### 5.1 [Dimension reduction technique (Chi-squared)](#dimension)
- #### 5-2 [Multinomial logistic regression](#model)
- #### 5-3 [Cross-validation](#cv)         

### 6. [Predict on test data](#test)

## 1. Importing libraries <a name="library"></a>

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pattern.en import parse
from nltk.corpus import stopwords
import string
import re
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn import svm
import swifter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
import warnings
warnings.filterwarnings("ignore")



## 2. Initialization<a name="initialisation"></a>

### Creating a custom dictionary to expand contractions

In [2]:
# Initialise the WordNet lemmatizer once at module level; pre_process() calls
# wn.lemmatize() on every token.
wn = nltk.WordNetLemmatizer()

# Custom lookup table that expands contractions, e.g. "won't" -> "will not".
# Keys are lower-case because the text is lower-cased before the lookup.
# Every key appears exactly once: a repeated key in a dict literal silently
# overrides the earlier value. "i'd" expands to "I would" so that it agrees
# with the other "'d" entries ("he'd", "she'd", "we'd", ...), and "we'll"
# keeps its pronoun ("we will").
appos = {
    "aren't": "are not", "can't": "cannot", "couldn't": "could not",
    "didn't": "did not", "doesn't": "does not", "don't": "do not",
    "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
    "he'd": "he would", "he'll": "he will", "he's": "he is",
    "i'd": "I would", "i'll": "I will", "i'm": "I am", "i've": "I have",
    "isn't": "is not", "it's": "it is", "it'll": "it will",
    "let's": "let us", "mightn't": "might not", "mustn't": "must not",
    "shan't": "shall not", "she'd": "she would", "she'll": "she will",
    "she's": "she is", "shouldn't": "should not", "that's": "that is",
    "there's": "there is", "they'd": "they would", "they'll": "they will",
    "they're": "they are", "they've": "they have", "wasn't": "was not",
    "we'd": "we would", "we'll": "we will", "we're": "we are",
    "we've": "we have", "weren't": "were not", "what'll": "what will",
    "what're": "what are", "what's": "what is", "what've": "what have",
    "where's": "where is", "who'd": "who would", "who'll": "who will",
    "who're": "who are", "who's": "who is", "who've": "who have",
    "won't": "will not", "wouldn't": "would not", "you'd": "you would",
    "you'll": "you will", "you're": "you are", "you've": "you have",
    "'re": " are",
}
#reference[1]

## 3. Reading the training data and labels <a name="read_train"></a>

### Merging the training data with its labels

In [3]:
# Load the training documents and the training labels from the working directory.
data = pd.read_csv("train_data.csv", sep=",")
data_labels = pd.read_csv("train_label.csv", sep=",")

# Left-join on the shared 'trn_id' key: every row of the training data is kept
# and the columns of the label file are attached to it.
df = data.merge(data_labels, on="trn_id", how="left")

## 4. Data pre-processing <a name="preprocess"></a>

In [4]:
#--------------------------
# Data pre-processing step
#--------------------------
def pre_process(text):
    """
    Clean a single document before it is vectorised.

    Takes in a string of text, then performs the following:
    1. Lower-cases the whole string
    2. Splits it into whitespace-separated tokens
    3. Expands contractions found in ``appos``. For example: "won't" --> "will not"
    4. Lemmatizes every token with the module-level ``wn`` lemmatizer
    5. Re-joins the tokens with single spaces

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text, tokens separated by exactly one space.
    """
    # Lower-case the string in one call rather than character by character:
    # it is faster and lets Python apply context-dependent case mappings.
    # split() with no argument breaks on any run of whitespace, so tokens next
    # to tabs, newlines or repeated spaces are still matched against `appos`.
    tokens = text.lower().split()

    # Tokens that are not known contractions pass through unchanged.
    expanded = [appos.get(token, token) for token in tokens]

    return " ".join(wn.lemmatize(token) for token in expanded)


#--------------------------
# execute pre-processing
#--------------------------
# Apply pre_process straight to the 'text' column. A row-wise apply (axis=1)
# hands a whole row to the callback only to read one field, which is much
# slower on a large frame; the column-wise form yields the same values.
df['text'] = df['text'].swifter.apply(pre_process)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=650000, style=ProgressStyle(description_wi…




## 5. Feature generation <a name="feature"></a>

### 5.1- Dimension reduction technique (Chi-square)<a name="dimension"></a>

In [5]:
#----------------------------------------------
# train/validation split and TF-IDF features
#----------------------------------------------

# Hold out 2% of the labelled rows for validation. The fixed random_state makes
# the split, and therefore the validation score, reproducible between runs.
x_train, x_validation, y_train, y_validation = train_test_split(
    df['text'], df['label'], test_size=.02, random_state=42)

# TF-IDF over 1- to 3-grams, capped at 100,000 features. The vectorizer is
# fitted on the training split only and then re-used unchanged on the
# validation split, so the held-out rows do not influence the vocabulary.
tvec = TfidfVectorizer(max_features=100000, ngram_range=(1, 3))
x_train_tfidf = tvec.fit_transform(x_train)
x_validation_tfidf = tvec.transform(x_validation)

#reference[2]

### 5-2 Multinomial logistic regression<a name="model"></a>

In [6]:
from sklearn import linear_model

# Chi-squared feature selection: keep the 40,000 best-scoring TF-IDF features.
# The selector is fitted on the training split and then applied to both splits.
ch = SelectKBest(chi2, k=40000)
ch.fit(x_train_tfidf, y_train)
x_train_feature_selected = ch.transform(x_train_tfidf)
x_test_chi_selected = ch.transform(x_validation_tfidf)  # validation split

# Multinomial logistic regression trained on the selected features.
clf = linear_model.LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
).fit(x_train_feature_selected, y_train)

# Mean accuracy on the held-out validation split; the bare name on the last
# line makes the notebook display it.
score = clf.score(x_test_chi_selected, y_validation)
score

0.6501538461538462

### 5-3 Cross-validation <a name="cv"></a>

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# 3-fold cross-validated accuracy of the logistic-regression model on the
# chi-squared-selected training features. The feature matrix must be
# `x_train_feature_selected`, the name produced by the selection step;
# any other name raises a NameError.
# NOTE(review): the selector was fitted on the full training split before the
# folds are made, so the folds are not fully independent of the selection step.
# Wrapping selector and classifier in a Pipeline would avoid that -- confirm
# whether it matters here.
k_fold = KFold(n_splits=3)
cross_val_score(clf, x_train_feature_selected, y_train, cv=k_fold, scoring='accuracy', n_jobs=-1)

--------------------------------

# 6. Prediction on test data<a name="test"></a>

In [7]:
#--------------------------------------
## Load the test documents into a dataframe
#--------------------------------------

test = pd.read_csv(
    "test_data.csv",
    sep=",",
)

In [9]:
#--------------------------------------------------------------------
## Cleaning the test data with the same pre_process used on the train data
#--------------------------------------------------------------------

# Apply pre_process straight to the 'text' column. A row-wise apply (axis=1)
# hands a whole row to the callback only to read one field, which is much
# slower; the column-wise form yields the same values.
test['text'] = test['text'].swifter.apply(pre_process)

HBox(children=(IntProgress(value=0, description='Pandas Apply', max=50000, style=ProgressStyle(description_wid…




In [10]:
#--------------------------------------------------------------------
## Vectorising the test text: fitted TF-IDF first, then chi-square selection
#--------------------------------------------------------------------

# Both transformers were fitted earlier on the training split and are only
# applied here, so the test features line up with what the model was trained on.
test_matrix = ch.transform(tvec.transform(test['text']))

In [11]:
#---------------------------------------------------------------------
## predicting the labels, storing it as label column in test dataframe
#---------------------------------------------------------------------

# Assign the prediction array directly: it is attached row by row in order.
# Wrapping it in a DataFrame first would make pandas align on index labels,
# which can misplace labels whenever `test` does not have a default 0..n-1 index.
test['label'] = clf.predict(test_matrix)

In [12]:
#-----------------------------------------------------------
## keep only the identifier and the predicted label columns
#-----------------------------------------------------------

test = test.loc[:, ['test_id', 'label']]

############################################################

#--------------------------------
# Write the predictions to csv (index=False: no index column in the file)
#--------------------------------

test.to_csv(
    'predict_label.csv',
    index=False,
)

Unnamed: 0,test_id,label
0,test_1,2
1,test_2,4
2,test_3,2
3,test_4,5
4,test_5,5


# References

[1] https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
[2] https://github.com/tthustla/twitter_sentiment_analysis_part8/blob/master/Capstone_part4-Copy6.ipynb