## Data and pre-processing Investigation
### Load Data

In [6]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

train_data_raw = pd.read_csv("./data/train.csv", encoding="ISO-8859-1") #utf-8 doesn't play nice
test_data_raw = pd.read_csv("./data/test.csv", encoding="ISO-8859-1")
disaster_type_none_label = 'none'
train_data = train_data_raw.replace(np.nan, disaster_type_none_label, regex=True)
train_data.drop_duplicates(subset=["text", "disaster", "disaster_type"], keep="first")
train_data.head()



Unnamed: 0,tweetid,text,disaster_type,disaster,Unnamed: 4
0,10001,@TheEllenShow Please check into Salt River hor...,none,0,none
1,10002,"As for the hurricane, it's already category 1 ...",hurricane,1,none
2,10003,So it looks like my @SoundCloud profile shall ...,none,0,none
3,10004,@SushmaSwaraj Am sure background check of the ...,none,0,none
4,10005,Open forex detonation indicator is irretrievab...,none,0,none


### Train Test split

In [3]:
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train_data, test_size=0.25, random_state=0)
train.head()


Unnamed: 0,tweetid,text,disaster_type,disaster,Unnamed: 4
1308,11309,ladies and gentlemen meet your new prime minis...,none,0,none
12427,22428,What if every 5000 wins in ranked play gave yo...,none,0,none
11035,21036,Fair enough we have two of the best attacking ...,none,0,none
1580,11581,Should I go outside after an #earthquake?,earthquake,1,none
4121,14122,I liked a @YouTube video https://t.co/vJSOOgoN...,none,0,none


In [58]:
import contractions
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

is_printable = set(string.printable)
def preprocessor(string):
    w_slash_to_with = re.sub(r"w/", "with", string)
    urlless = re.sub(r"http\S+", " ", w_slash_to_with)
    html_encoding_removal = re.sub(r"(&\d\d\d\d;)|(&x....;)|(&\w{2,4});", "", urlless)
    numericOrdinalless = re.sub(r"\d+(st|nd|rd|th)", " ", html_encoding_removal)
    numberless = re.sub(r"\d+", " ", numericOrdinalless)
    contractionless = contractions.fix(numberless)
    only_alpha_whitespace = re.sub(r"[^a-zA-Z\s]", " ", contractionless)
    return string

tweet_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
custom_stop_words = [
    "debbie",
    "australia",
    "queensland"
]
def tokenizer(text):
    words = tweet_tokenizer.tokenize(text)
    u_to_you = ["you" if w == "u" else w for w in words]
    custom_stop_wordless = [w for w in u_to_you if w not in custom_stop_words]
    stop_wordless = [w for w in custom_stop_wordless if w not in stop_words]
    return words

with open('./output/output.txt', 'w') as f:
    for item in train["text"]:
        f.write(" ".join(tokenizer(preprocessor(item))) + "\n")
        
foo = CountVectorizer(strip_accents='unicode', analyzer='word', lowercase=True,
                      preprocessor=preprocessor, tokenizer=tokenizer)
foo_fit = foo.fit_transform(train["text"])

word_list = foo.get_feature_names()   
count_list = foo_fit.toarray().sum(axis=0)
print(f'Vocabulary Size: {len(word_list)}')
sorted(zip(word_list, count_list), key=lambda item: item[1], reverse=True)

Vocabulary Size: 26395


[('the', 5862),
 ('.', 4522),
 ('and', 4467),
 ('to', 3171),
 ('i', 3056),
 ('a', 2899),
 ('of', 2233),
 ('?', 2143),
 ('in', 2110),
 (',', 2050),
 ('!', 2012),
 (':', 1607),
 ('is', 1562),
 ('my', 1490),
 ('you', 1391),
 ('for', 1378),
 ('on', 1185),
 ('...', 1148),
 ('that', 1144),
 ('it', 1065),
 ('\x89', 968),
 ('just', 906),
 ("'", 902),
 ('-', 893),
 ('with', 853),
 ('earthquake', 806),
 ('was', 780),
 ('me', 758),
 ('"', 750),
 ('this', 743),
 ('at', 690),
 ('be', 680),
 ('are', 639),
 ('÷', 638),
 ('all', 616),
 ('have', 585),
 ('so', 581),
 ("i'm", 549),
 ('like', 539),
 ('from', 532),
 ('but', 528),
 ('your', 525),
 ('an', 517),
 ('out', 512),
 ('not', 508),
 ('up', 494),
 ('by', 484),
 ('debbie', 460),
 ('if', 460),
 ('we', 440),
 ('cyclone', 433),
 ('now', 427),
 ('get', 424),
 ('(', 421),
 (')', 413),
 ('û_', 392),
 ('&', 381),
 ('as', 377),
 ('about', 375),
 ('one', 371),
 ('will', 368),
 ('new', 363),
 ("it's", 359),
 ('_ù', 356),
 ('flood', 342),
 ('how', 330),
 ('when'

In [48]:
from nltk import pos_tag, ne_chunk
train["tokenized"] = train.apply(lambda r: tokenizer(preprocessor(r["text"])), axis=1)


In [None]:
def named_entity_extraction(r):
    return ne_chunk(pos_tag(r["tokenized"]))
