In [1]:
def Jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    number of distinct shared tokens divided by the number of distinct
    tokens appearing in either string.

    Parameters
    ----------
    str1, str2 : str
        The two texts to compare.

    Returns
    -------
    float
        A value in [0.0, 1.0]. When neither string contains any token the
        ratio is undefined (0 / 0); this function then returns 1.0, treating
        two empty token sets as identical, instead of raising
        ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)

    # len(a) + len(b) - len(c) is the size of the union of the two token sets.
    union_size = len(a) + len(b) - len(c)
    if union_size == 0:
        # Both inputs were empty or whitespace-only.
        return 1.0
    return float(len(c)) / union_size

In [2]:
import pandas as pd 
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

## Load the training and test CSV files from the current working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [3]:
## Discard every training row that has a missing value in any column
train = train.dropna()

## Build one dataframe per sentiment label from the cleaned training data
positive_train = train.loc[train['sentiment'].eq('positive')]
negative_train = train.loc[train['sentiment'].eq('negative')]
neutral_train = train.loc[train['sentiment'].eq('neutral')]


In [4]:
## Build bag-of-words counts with scikit-learn's CountVectorizer

cv = CountVectorizer(
    ## max_df = 0.95 drops terms that occur in more than 95% of the tweets (near-universal filler words); min_df = 2 drops terms that occur in fewer than 2 tweets (mostly typos and one-offs); max_features caps the vocabulary at 10000 terms; stop_words = 'english' removes the built-in English stop-word list. Words that end up outside the vocabulary are later treated as having a weight of 0.
    max_df = 0.95, min_df = 2, max_features = 10000, stop_words = 'english'
    )

## Learn the vocabulary from every training tweet
train_cv = cv.fit_transform(train.text)

## Count the vocabulary terms in each sentiment subset
positive = cv.transform(positive_train.text)
negative = cv.transform(negative_train.text)
neutral = cv.transform(neutral_train.text)

## CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
## Prefer get_feature_names_out() and fall back to the old name on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = list(cv.get_feature_names())

## Per-tweet term counts as dataframes (one row per tweet, one column per vocabulary term)
positive_df = pd.DataFrame(positive.toarray(), columns = feature_names)
negative_df = pd.DataFrame(negative.toarray(), columns = feature_names)
neutral_df = pd.DataFrame(neutral.toarray(), columns = feature_names)

## For every vocabulary term: total occurrences within a sentiment divided by the
## number of tweets of that sentiment, i.e. the average count of the term per tweet.
## The column sums are computed in one vectorised call instead of a Python loop over features.
positive_words = (positive_df.sum(axis = 0) / positive_train.shape[0]).to_dict()
negative_words = (negative_df.sum(axis = 0) / negative_train.shape[0]).to_dict()
neutral_words = (neutral_df.sum(axis = 0) / neutral_train.shape[0]).to_dict()





In [5]:
## Make each word's weight more discriminative: from the word's frequency within one
## sentiment, subtract the sum of its frequencies within the other two sentiments.
## A word common to all sentiments ends up with a negative weight everywhere.

negative_words_adjusted = {
    word: negative_words[word] - (neutral_words[word] + positive_words[word])
    for word in negative_words
}
positive_words_adjusted = {
    word: positive_words[word] - (neutral_words[word] + negative_words[word])
    for word in positive_words
}
neutral_words_adjusted = {
    word: neutral_words[word] - (negative_words[word] + positive_words[word])
    for word in neutral_words
}

## The per-word weights are now ready to be used for prediction.

In [6]:


def Prediction(text, sentiment):
    """Pick the span of a tweet that best expresses the given sentiment.

    The tweet is lower-cased and split on whitespace. Every contiguous run of
    tokens is scored as the sum of its tokens' adjusted weights for the
    requested sentiment (tokens missing from the weight table count as 0),
    and the highest-scoring run is returned.

    Parameters
    ----------
    text : str
        The full tweet.
    sentiment : str
        One of 'neutral', 'positive' or 'negative'.

    Returns
    -------
    str
        * The original ``text`` unchanged when ``sentiment`` is 'neutral' or
          the tweet has three tokens or fewer.
        * Otherwise the best-scoring run of tokens, lower-cased and joined
          with single spaces.

    Raises
    ------
    ValueError
        If the tweet has more than three tokens and ``sentiment`` is not one
        of the three recognised labels.

    Notes
    -----
    When several runs share the best score, the one visited last wins
    (runs are visited by increasing start index, then increasing end index).
    """
    tokens = text.lower().split()

    # Neutral and very short tweets are returned whole, in their original form.
    if sentiment == 'neutral' or len(tokens) <= 3:
        return text

    if sentiment == 'positive':
        dictionary = positive_words_adjusted
    elif sentiment == 'negative':
        dictionary = negative_words_adjusted
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "sentiment must be 'neutral', 'positive' or 'negative', got %r" % (sentiment,)
        )

    best_weight = float('-inf')
    # An empty slice by default, so the result is '' if no run is ever selected.
    best_start, best_end = 0, -1

    for start in range(len(tokens)):
        # Running sum over tokens[start:end + 1]; it accumulates left to right,
        # exactly as summing each run from scratch would.
        running_weight = 0
        for end in range(start, len(tokens)):
            running_weight = running_weight + dictionary.get(tokens[end], 0)
            # '>=' (not '>') so that on ties the run visited last is kept.
            if running_weight >= best_weight:
                best_weight = running_weight
                best_start, best_end = start, end

    return ' '.join(tokens[best_start:best_end + 1])


        

In [7]:
## Predict the selected text for every training tweet
output = [
    Prediction(tweet, label)
    for tweet, label in zip(train.text, train.sentiment)
]

train['output'] = output

## Prediction always returns a plain string, so no per-row "flattening" is needed.
## The previous patch here overwrote the second row's prediction with a single
## character (through chained assignment) and has been removed.

In [8]:
## Calculate the Jaccard score of every training prediction against its labelled selected_text
## (the stray mid-cell train.head() was never displayed and has been dropped)
jaccard = [
    Jaccard(truth, predicted)
    for truth, predicted in zip(train.selected_text, train.output)
]
train['jaccard'] = jaccard

In [9]:
## Report the average Jaccard score over the training tweets
print(train['jaccard'].mean())

0.5841617068218866


In [10]:
## Run the predictor over every tweet in the test set

output = [
    Prediction(tweet, label)
    for tweet, label in zip(test['text'], test['sentiment'])
]

test['selected_text'] = output

## Keep only the identifier and the predicted span for the submission
submission = test.loc[:, ['textID', 'selected_text']]



In [11]:
## Write only the textID and selected_text columns; index=False keeps the
## dataframe's row index from being written as an extra unnamed first column.
submission.to_csv('Simple_Submission.csv', index=False)