# Import library

In [190]:
import pandas as pd
import numpy as np

# NLP
import nltk
import re
from nltk.corpus import stopwords         
from nltk.stem import PorterStemmer       
from nltk.tokenize import word_tokenize
import string
import heapq
stopwords_english = stopwords.words('english') 

# sk-learn
from sklearn.model_selection import train_test_split

#  Helper functions

In [191]:
# Helper functions

stemmer = PorterStemmer()
def preprocess(sentence):
    '''
    Preprocesses the data
    '''

    # Tokenize & lower casing
    sentence = word_tokenize(sentence.lower())

    # Stemmer & Stop words
    sentence = [stemmer.stem(i) for i in sentence if (i not in stopwords_english) and (i not in string.punctuation)]

    return sentence

# Import data

In [192]:
df = pd.read_csv('train.csv')

df['original_text'] = df['text'].copy()
df['text'] = df['text'].apply(lambda x:preprocess(x))

X = df['text']
y = df['target']
df.head(2)


# split the data into 2 sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify = y, random_state=42)

# Preprocess

In [193]:
# get rid of noise, keep top 10% words

wordfreq = {}
for sentence in X_train:
    for word in sentence:
        if word not in wordfreq:
            wordfreq[word] = 1
        else:
            wordfreq[word] += 1

# experiment and play with proportions until we see a good collections of words 
most_freq = heapq.nlargest(round(0.05*len(wordfreq)), wordfreq, key=wordfreq.get)
set_most_freq = set(most_freq)
# get rid of useless_words
set_useless_words = set(["''","'d",  "'m",  "'s", "'ve", '--', '...', '1', '10','\x96','2','3','5','``',
                        "\x89û_","'s", '1', '2','2015','3','4','5','70','16yr','\x89ûó','\x89ûò', "n't"
                        ])
set_most_freq = set_most_freq.difference(set_useless_words)
display(set_most_freq)

# keep only 10 % words
X_train = X_train.apply(lambda x: [i for i in x if i in set_most_freq])
X_test = X_test.apply(lambda x: [i for i in x if i in set_most_freq])

{"'conclus",
 "'i",
 "'ll",
 "'re",
 "'the",
 "'we",
 '..',
 '15',
 '40',
 '50',
 '60',
 '7',
 '8',
 '9',
 'abc',
 'ablaz',
 'accid',
 'account',
 'act',
 'action',
 'activ',
 'actual',
 'affect',
 'ago',
 'air',
 'aircraft',
 'airplan',
 'airport',
 'alarm',
 'allow',
 'almost',
 'alreadi',
 'also',
 'alway',
 'ambul',
 'america',
 'american',
 'amid',
 'amp',
 'angri',
 'annihil',
 'anniversari',
 'anoth',
 'answer',
 'anthrax',
 'anyon',
 'anyth',
 'apocalyps',
 'appear',
 'area',
 'armageddon',
 'armi',
 'around',
 'arson',
 'ass',
 'atom',
 'attack',
 'august',
 'australia',
 'avalanch',
 'away',
 'b',
 'babi',
 'back',
 'bad',
 'bag',
 'ball',
 'ban',
 'bang',
 'bar',
 'battl',
 'bc',
 'beach',
 'beauti',
 'becom',
 'begin',
 'behind',
 'believ',
 'best',
 'bestnaijamad',
 'better',
 'big',
 'bigger',
 'bioterror',
 'black',
 'blast',
 'blaze',
 'bleed',
 'blew',
 'blight',
 'blizzard',
 'block',
 'blood',
 'bloodi',
 'blown',
 'boat',
 'bodi',
 'bomb',
 'bomber',
 'book',
 'boy'

# log regression (bag of words)



In [194]:
# Make the columns

def make_columns(X, set_freq):
    word_dict = {}
    for token in set_freq:
        word_list = []
        for sentence in X:
            if token in sentence:
                word_list.append(1)
            else:
                word_list.append(0)
        word_dict[token] = word_list
    return word_dict

X_train_word_dict = make_columns(X_train, set_most_freq)
X_test_word_dict = make_columns(X_test, set_most_freq)

# Build the bag of words matrix
X_train_processed = pd.DataFrame(X_train_word_dict)
display(X_train_processed.head(2))

column_order = X_train_processed.columns

X_test_processed = pd.DataFrame(X_test_word_dict)
X_test_processed = X_test_processed[column_order] # make sure same column order
display(X_test_processed.head(2))

Unnamed: 0,pass,cat,today,trench,villag,bestnaijamad,fan,ur,first,50,...,old,realli,music,intern,coach,failur,like,answer,movi,deton
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,pass,cat,today,trench,villag,bestnaijamad,fan,ur,first,50,...,old,realli,music,intern,coach,failur,like,answer,movi,deton
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [195]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

clf = LogisticRegression(random_state=0)
clf.fit(X_train_processed, y_train)

y_pred_train = clf.predict(X_train_processed)
y_pred_test  = clf.predict(X_test_processed)

print(accuracy_score(y_pred_train, y_train))
print(accuracy_score(y_pred_test, y_test))

# ensure training score not too far away  from testing. Otherwise, it may be an indication of over-fitting

0.8394117647058823
0.7954635893354556


# Manual inspection, to see if it is a good model

In [196]:
# is this a good model
# The split is about 50/50, so a 82% accuracy model isnt bad!
print(y_train.value_counts())

# check answers
check_answers = df.iloc[y_test.index,:]
check_answers['prediction'] = y_pred_test
display(check_answers.head(4))

0    2909
1    2191
Name: target, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,id,keyword,location,text,target,original_text,prediction
6006,8578,screams,Sheffield/Leeds,"[agre, certain, cultur, appropri, thing, hones...",1,I agree with certain cultural appropriation th...,0
4681,6655,landslide,,"[hoodedu, fuck, better, berlatski, n't, win, f...",0,@hoodedu You fucking better Berlatsky. If I d...,0
3063,4395,earthquake,"Seattle, WA","[sure, megaquak, stori, brought, sens, panic, ...",1,Sure the #Megaquake story brought a sense of p...,0
3079,4416,electrocute,,"[danisnotonfir, n't, let, phil, help, 'll, pro...",0,@danisnotonfire don't let Phil help out he'll ...,0


In [197]:
# check what's wrong
check_wrong = check_answers[check_answers['target'] != check_answers['prediction']].reset_index()
for i in range(5):
    print('ACTUAL : ', check_wrong['target'][i], 'PREDICTION : ', check_wrong['prediction'][i])
    print(check_wrong['original_text'][i])
    print('#############################')
    
    

ACTUAL :  1 PREDICTION :  0
I agree with certain cultural appropriation things but honestly if u looked at my house it screams appropriation bc Buddhas and stuff-
#############################
ACTUAL :  1 PREDICTION :  0
Sure the #Megaquake story brought a sense of panic but the question is: will anything really change? http://t.co/9f3rDN9N3D
#############################
ACTUAL :  1 PREDICTION :  0
Learning from the Legacy of a Catastrophic Eruption http://t.co/25sY9Y295L via @newyorker
#############################
ACTUAL :  1 PREDICTION :  0
Telnet attacked from 124.13.172.40 (STREAMYX-HOME-SOUTHERN MY)
#############################
ACTUAL :  1 PREDICTION :  0
Zayn Malik &amp; Perrie Edwards End Engagement: SheÛªs Û÷DevastatedÛª http://t.co/GedOxSPpL9 http://t.co/ACZRUOrYtD
#############################


To be fair, I would have guessed some of these tweets wrong too.

# Note

There are some errors in the training data. We can see that Zayn Malik is marked as 1, but Zayn is definitely not related to the weathe catastrphy

In [198]:
df = pd.read_csv('train.csv')
df['original_text'] = df['text'].copy()


zayn = df[df['original_text'].str.contains('Zayn')].reset_index()
for i in range(3):
    print(zayn['target'][i])
    print(zayn['original_text'][i])
    print('\n\n')

0
Zayn just blew up twitter.



1
Zayn Malik &amp; Perrie Edwards End Engagement: SheÛªs Û÷DevastatedÛª http://t.co/GedOxSPpL9 http://t.co/ACZRUOrYtD



1
Me pulling over and fighting the hoes that called Zayn a terrorist  http://t.co/FY30fV0Qbx



