In [7]:
#===============================================================================
#               Read the Input Training and Test File           
#===============================================================================
import pandas as pd
# Load the training and test tweet files with one shared reader configuration:
# the first line of each file is skipped and the four column names are supplied
# explicitly.
train, test = (
    pd.read_csv(
        path,
        skiprows = 1,
        names= ["ItemID", "Sentiment", "SentimentSource", "SentimentText"],
    )
    for path in ('tweets_file_dmy.txt', 'tweets_file_test.txt')
)


In [10]:
#===============================================================================
#                         Data Cleaning                                   
#===============================================================================
import numpy as np
import re
import nltk

# Make sure the NLTK stop-word corpus is available locally before loading it.
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words; clean_data() drops these tokens from every tweet.
stop = stopwords.words('english')

# Import the stemmer by name instead of `import *`, so no other names from the
# module leak into the notebook namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    Parameters
    ----------
    input_txt : str
        Text to clean.
    pattern : str
        Regular expression describing the spans to delete.

    Returns
    -------
    str
        ``input_txt`` with all non-overlapping matches of ``pattern`` deleted.

    Notes
    -----
    The substitution is done in a single ``re.sub`` pass over the original
    pattern. Re-using each matched string as a new regex (the previous
    approach) is unsafe: a short match such as ``@bob`` also eats the prefix
    of ``@bobby``, and matches containing regex metacharacters are
    reinterpreted as patterns.
    """
    return re.sub(pattern, '', input_txt)

#Data Clean and tokenize function
def clean_data(text):
    """Add cleaned and tokenized versions of ``SentimentText`` to ``text``.

    Two columns are written onto the DataFrame that is passed in (it is
    modified in place and also returned):

    * ``clean_tweet`` - the tweet with ``@handle`` mentions removed,
      lower-cased, and every character other than ``a-z``, ``0-9`` and ``#``
      replaced by a space.
    * ``tokenized_tweet`` - ``clean_tweet`` split on whitespace, with the
      module-level ``stop`` words dropped, each remaining token stemmed with
      the module-level ``stemmer``, and the tokens re-joined by single spaces.

    Parameters
    ----------
    text : pandas.DataFrame
        Frame holding a string column named ``SentimentText``.

    Returns
    -------
    pandas.DataFrame
        The same ``text`` object, with the two columns added.
    """
    # Strip @mentions in one regex pass. `regex=True` is spelled out because
    # pandas 2.0 changed the default of Series.str.replace to literal matching.
    without_handles = text['SentimentText'].str.replace(r"@[\w]*", "", regex=True)
    lowered = without_handles.str.lower()
    text['clean_tweet'] = lowered.str.replace(r"[^a-z0-9#]", " ", regex=True)

    # Set membership keeps the per-token stop-word check cheap.
    stop_words = set(stop)

    def _stem_without_stopwords(tweet):
        # Split -> drop stop words -> stem -> re-join into one string.
        return ' '.join(
            stemmer.stem(token) for token in tweet.split() if token not in stop_words
        )

    text['tokenized_tweet'] = text['clean_tweet'].apply(_stem_without_stopwords)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vipul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
#Cleaning the train and test data
cleaned_train = clean_data(train)
cleaned_test = clean_data(test)
# Ground-truth labels for the test set, taken from the cleaned test frame
# (the previous name `cleaned_data_test` was never defined).
test_target = cleaned_test['Sentiment']

In [13]:
# Sanity check: show the stemmed, stop-word-free text produced by clean_data().
print(cleaned_train['tokenized_tweet'])

0                                           sad apl friend
1                                    miss new moon trailer
2                                         omg alreadi 7 30
3        omgaga im sooo im gunna cri dentist sinc 11 su...
4                                        think mi bf cheat
                               ...                        
89984    gnome hat problem finish size pointi top mama ...
89985    saw linn bakeri thought veggi friendli worri f...
89986                                           would love
89987                                                 evid
89988    spine thing sound good back exercis fun best l...
Name: tokenized_tweet, Length: 89989, dtype: object


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer settings: ignore terms present in more than 90% of the documents
# or in fewer than 10 documents, and apply scikit-learn's English stop list.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    max_df=0.90,
)

# Dense TF-IDF matrices: the vocabulary is learned from the training tweets
# only and then re-used, unchanged, to encode the test tweets.
tfidf = (
    tfidf_vectorizer
    .fit_transform(cleaned_train['tokenized_tweet'])
    .toarray()
)
tfidf_test = (
    tfidf_vectorizer
    .transform(cleaned_test['tokenized_tweet'])
    .toarray()
)

In [23]:
# Keep a reference to the fitted vectorizer's vocabulary for inspection.
temp = tfidf_vectorizer.vocabulary_

In [26]:
# Display the training frame, including the columns added by clean_data().
cleaned_train

Unnamed: 0,ItemID,Sentiment,SentimentSource,SentimentText,tokenized_tweet,clean_tweet
0,1,0,Sentiment140,is so sad for my APL frie...,sad apl friend,is so sad for my apl frie...
1,2,0,Sentiment140,I missed the New Moon trail...,miss new moon trailer,i missed the new moon trail...
2,3,1,Sentiment140,omg its already 7:30 :O,omg alreadi 7 30,omg its already 7 30 o
3,4,0,Sentiment140,.. Omgaga. Im sooo im gunna CRy. I'...,omgaga im sooo im gunna cri dentist sinc 11 su...,omgaga im sooo im gunna cry i ...
4,5,0,Sentiment140,i think mi bf is cheating on me!!! ...,think mi bf cheat,i think mi bf is cheating on me ...
...,...,...,...,...,...,...
89984,89996,1,Sentiment140,@clevercatsknit Re: gnome hat. Was the problem...,gnome hat problem finish size pointi top mama ...,re gnome hat was the problem the finished s...
89985,89997,1,Sentiment140,@clevercatsknit Saw Linnes Bakery but thought ...,saw linn bakeri thought veggi friendli worri f...,saw linnes bakery but thought it not too vegg...
89986,89998,1,Sentiment140,@cleverdaisies I would LOVE to!!!,would love,i would love to
89987,89999,0,Sentiment140,@cleverick evidently not,evid,evidently not


In [30]:
# Feature matrix: the dense TF-IDF encoding of the training tweets.
X = tfidf
# Training labels come from the cleaned training frame (the previous name
# `cleaned_data` was never defined).
train_target = cleaned_train['Sentiment']

In [34]:
#Functions for sigmoid, gradient descent, weight update and accuracy calculation
def sigmoid(X, weight):
    """Apply the logistic function to the linear scores ``np.dot(X, weight)``.

    Returns an array of values in (0, 1) with the shape of ``np.dot(X, weight)``.
    """
    linear_scores = np.dot(X, weight)
    denominator = 1 + np.exp(-linear_scores)
    return 1 / denominator

def gradient_descent(X, h, y):
    """Return ``X.T`` dotted with the residual ``h - y``, divided by ``y.shape[0]``."""
    residual = h - y
    sample_count = y.shape[0]
    return np.dot(X.T, residual) / sample_count

def update_weight_loss(weight, learning_rate, gradient):
    """Take one descent step: subtract ``learning_rate * gradient`` from ``weight``."""
    step = learning_rate * gradient
    return weight - step

def predict_t(x, theta):
    """Return ``sigmoid`` of ``x`` against ``theta`` reshaped with a new axis 1.

    A 1-D ``theta`` of length d becomes a (d, 1) column before the call.
    """
    theta_column = np.expand_dims(theta, axis=1)
    return sigmoid(x, theta_column)

def acc_calc(actual, pred):
    """Return the fraction of predictions that match ``actual``.

    ``pred`` is thresholded at 0.5 (values >= 0.5 become 1, others 0) and
    flattened to 1-D before being compared element-wise with ``actual``.
    """
    hard_labels = (pred >= 0.5).astype(int).flatten()
    matches = hard_labels == actual
    return np.mean(matches)

In [35]:
#Function for running gradient descent
def run_grad(X, y, num_iter=100, learning_rate=0.1):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; one weight is learned per column (``X.shape[1]``).
    y : array-like
        Target values, one per row of ``X``.
    num_iter : int, optional
        Number of gradient-descent updates to perform (default 100, the
        value that used to be hard-coded).
    learning_rate : float, optional
        Step size applied to each gradient (default 0.1, the value that
        used to be hard-coded).

    Returns
    -------
    numpy.ndarray
        1-D weight vector of length ``X.shape[1]``.
    """
    # Work on a plain array so the arithmetic below is purely positional,
    # whatever index a pandas Series of labels may carry.
    y = np.asarray(y)

    # Start from all-zero weights.
    theta = np.zeros(X.shape[1])

    for _ in range(num_iter):
        h = sigmoid(X, theta)
        gradient = gradient_descent(X, h, y)
        theta = update_weight_loss(theta, learning_rate, gradient)
    return theta

In [37]:
from numpy import array
from sklearn.model_selection import KFold

# K-fold Cross Validation
# 10 shuffled folds with a fixed seed. `shuffle` and `random_state` are passed
# by keyword because current scikit-learn releases reject them positionally.
kfoldcv = KFold(n_splits=10, shuffle=True, random_state=1)
bestacc = 0
theta_final = np.zeros(X.shape[1])

# KFold yields positional row numbers, so index the labels through a plain
# array rather than through the Series' label-based lookup.
y_all = np.asarray(train_target)

# Enumerate splits. The loop variables are deliberately NOT called `train` /
# `test`, which would overwrite the DataFrames loaded at the top of the notebook.
for train_idx, validate_idx in kfoldcv.split(X):
    X_train = X[train_idx]
    X_validate = X[validate_idx]

    Y_train = y_all[train_idx]
    Y_validate = y_all[validate_idx]

    theta_out = run_grad(X_train, Y_train)

    pred = predict_t(X_validate, theta_out)

    acc = acc_calc(Y_validate, pred)

    # Keep the weights of the fold with the highest validation accuracy
    # (the previous comparison used the undefined name `bestaccuracy`).
    if acc > bestacc:
        theta_final = theta_out
        bestacc = acc

In [38]:
# Show the weight vector and accuracy retained by the cross-validation cell.
print(theta_final)
print(bestacc)

[1.17983528e-04 3.96188669e-04 9.77293871e-05 ... 7.12300904e-05
 5.67245301e-05 4.92836337e-04]
0.7033003667074119


In [45]:
# Sigmoid outputs for the test TF-IDF matrix, using the retained weights.
test_out = predict_t(tfidf_test, theta_final)

In [43]:
# Accuracy of the 0.5-thresholded test predictions against the test labels.
acc = acc_calc(test_target, test_out)
acc

0.705

In [46]:
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix


# Average precision is computed from the raw sigmoid outputs against the test
# labels (the previous argument `y_test` was never defined in this notebook).
average_precision = average_precision_score(test_target, test_out)
# Hard 0/1 predictions: outputs of at least 0.5 count as class 1.
test_final_out = ((test_out >= 0.5) .astype(int))

#Precision Score
prec = precision_score(test_target, test_final_out)
#Recall Score
recall = recall_score(test_target, test_final_out)
# Confusion matrix of the hard predictions (computed but not printed below).
conf = confusion_matrix(test_target, test_final_out)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))

print('Precision score: {0:0.2f}'.format(prec))

print('Recall score: {0:0.2f}'.format(recall))

Average precision-recall score: 0.81
Precision score: 0.70
Recall score: 0.89
