In [1]:
import numpy as np
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.mode.chained_assignment = None 

In [2]:
dir_name = 'datasets/'
df_tr = pd.read_csv(dir_name + 'train.csv')
df_te = pd.read_csv(dir_name + 'test.csv')

# Remove empty rows from training dataset
df_tr = df_tr[df_tr['text'].notna()]


In [3]:
# explore the data and render images for report

In [4]:
def jaccard(str1, str2):
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    score = float(len(c)) / ((len(a) + len(b)) - len(c))
    
    return score

def text_preprocess(text):
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\w*\d\w*', '', text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    
    return text

def proportion_calculation(dataset, f_mat):
    dictt = {}
    df = pd.DataFrame(f_mat.toarray(), columns=vectorizer.get_feature_names())
    
    for i in vectorizer.get_feature_names():
        summ = df[i].sum()
        dictt[i] = summ / len(dataset)
     
    return dictt

def weight_calculation(target, cat_1, cat_2):
    adjusted_weight = {}
    
    for key, value in target.items():
        adjusted_weight[key] = target[key] - (cat_1[key] + cat_2[key])
        
    return adjusted_weight

def create_subsets(words):
    return [words[i:j + 1] for i in range(len(words)) for j in range(i, len(words))]

def predict_selected_text(tweet_data, dictionary, tolerance):
    text = tweet_data['text']
    sentiment = tweet_data['sentiment']
    
    # if sentiment is neutral or text is less than 3 words then return whole text
    if(sentiment == 'neutral'):
        return text
    elif(len(text) < 3):
        return text
    
    word_dict = dictionary[sentiment][sentiment]
    
    selected_text = str()
    words = text.split()
    subsets = create_subsets(words)
    sorted_list = sorted(subsets, key=len)
    
    score = 0
    for i in range(len(subsets)):
        n_sum = 0
        for j in range(len(sorted_list[i])):
            if(sorted_list[i][j] in word_dict.keys()):
                n_sum += word_dict[sorted_list[i][j]]
                      
        if(n_sum > score + tolerance):
            score = n_sum
            selected_text = sorted_list[i]
                
    if(len(selected_text) == 0):
        selected_text = words
                
            
    return ' '.join(selected_text)

# calculate score for validation set
def validation(dataset, dictionaries):
    tolerance = 0.001
    
    dataset['predicted'] = str()

    for index, row in dataset.iterrows():
        predicted = predict_selected_text(row, dictionaries, tolerance)
        dataset.loc[dataset['textID'] == row['textID'],['predicted']] = predicted
        
    dataset['jaccard'] = dataset.apply(lambda x: jaccard(x.selected_text, x.predicted), axis = 1)
    print('jaccard score is: {}'.format(np.mean(dataset['jaccard'])))

In [5]:
# simple preprocessing
df_tr['text'] = df_tr['text'].apply(lambda x: text_preprocess(x))
df_tr['selected_text'] = df_tr['selected_text'].apply(lambda x: text_preprocess(x))
df_te['text'] = df_te['text'].apply(lambda x: text_preprocess(x))

# split training data
X_train, X_test = train_test_split(df_tr, test_size = 0.2, random_state = 0)

train_pos = X_train[X_train['sentiment'] == 'positive']
train_neu = X_train[X_train['sentiment'] == 'neutral']
train_neg = X_train[X_train['sentiment'] == 'negative']


all_pos = str()
all_neg = str()
all_neu = str()
for i in train_pos['text']:
    all_pos += i

for i in train_neg['text']:
    all_neg += i

for i in train_neu['text']:
    all_neu += i
    
data = [all_pos, all_neg, all_neu]

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=50000, stop_words='english')
r = vectorizer.fit_transform(data)


features = vectorizer.get_feature_names()
index = [n for n in data]
df = pd.DataFrame(r.T.todense(), index=features, columns=index)

#print(df.rename(columns={'A': 'a', 'C': 'c'}))

new_df = df.rename(columns={all_pos: 'positive', all_neg: 'negative', all_neu: 'neutral'})
#new_df.head(n=30)

pos_df = new_df.filter(regex='pos')
neg_df = new_df.filter(regex='neg')
neu_df = new_df.filter(regex='neu')

pos_dict = pos_df.to_dict()
neg_dict = neg_df.to_dict()
neu_dict = neu_df.to_dict()

weight_dictionary = { 'positive': pos_dict, 'negative': neg_dict, 'neutral': neu_dict}
#print(weight_dictionary['positive']['positive'])
validation(X_test, weight_dictionary)
#pos_df.head()
#print(df[all_pos])

jaccard score is: 0.5809001834813189


In [6]:
X_test.head(n=30)

Unnamed: 0,textID,text,selected_text,sentiment,predicted,jaccard
20673,3391184efc,loves the nice weather and exams,loves,positive,loves the nice weather and exams,0.166667
12581,b35daf9677,okay this weather isnt cute sundress friendly ...,okay this weather isnt cute sundress friendly ...,neutral,okay this weather isnt cute sundress friendly ...,1.0
13136,06e5249859,woo hoo congratulations,congratulations,positive,woo hoo congratulations,0.333333
14013,3cd4960670,thanks got a hold of someone there who knew...,thanks,positive,thanks got a hold of someone there who knew th...,0.083333
25030,92b75314ca,got back and putting in the laundry we got in ...,got back and putting in the laundry we got in ...,neutral,got back and putting in the laundry we got in ...,1.0
19380,7438c9c09a,congrats on graduating college,congrats,positive,graduating,0.0
17226,fa042d9ad5,can you get me a sub from subway when ur on yo...,can you get me a sub from subway when ur on yo...,neutral,can you get me a sub from subway when ur on yo...,1.0
10078,a4ede54987,its the weekend but year old is grounded whic...,its the weekend but year old is grounded whic...,neutral,its the weekend but year old is grounded whic...,1.0
3440,35124a7481,its been fine for me for like a week but now ...,its been fine for me for like a week but now i...,neutral,its been fine for me for like a week but now ...,1.0
6294,e4b6c9a295,ntah i really wanna be there,wanna,positive,ntah i really wanna be there,0.166667
