In [1]:
import os
import subprocess
import logging
from tqdm import tqdm
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [2]:
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Load data.
train = pd.read_csv('train.csv', encoding = 'utf8')
test = pd.read_csv('test.csv', encoding = 'utf8')
print(
    'train data shape: {}\n\
    train data description:\n{}\n\
    test data shape: {}\n\
    test data description:\n{}'.format(
        train.shape, train.describe(),
        test.shape, test.describe()
    )
)
print(train.head())

train data shape: (7613, 5)
    train data description:
                 id      target
count   7613.000000  7613.00000
mean    5441.934848     0.42966
std     3137.116090     0.49506
min        1.000000     0.00000
25%     2734.000000     0.00000
50%     5408.000000     0.00000
75%     8146.000000     1.00000
max    10873.000000     1.00000
    test data shape: (3263, 4)
    test data description:
                 id
count   3263.000000
mean    5427.152927
std     3146.427221
min        0.000000
25%     2683.000000
50%     5500.000000
75%     8176.000000
max    10875.000000
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got 

In [4]:
len(set(train.keyword.values))

222

In [5]:
sum(-pd.isnull(train.keyword))

7552

In [6]:
len(set(train.location.values))

3342

In [7]:
sum(-pd.isnull(train.location))

5080

In [8]:
set(train.location.values)

{nan,
 'North Carolina',
 'Republic of the Philippines',
 'Frostburg',
 'EGYPT',
 'RhodeIsland',
 'Peterborough, Ontario, Canada',
 'Kuwait',
 'Atlantic Highlands, NJ',
 'Chicagoland',
 'Nevada, USA',
 'Ohio, USA',
 'Malibu/SantaFe/Winning!',
 'East Kilbride',
 'Screwston, TX',
 'Newark, NJ',
 'Jerusalem',
 '? ',
 'Colorado, USA',
 'Den Helder, Rijkswerf',
 'Free State, South Africa',
 'lagos nigeria',
 'Montana ',
 'Rome, Italy',
 'Huntsville, Alabama',
 'San Luis Obispo, CA',
 'Estados Unidos',
 'Toronto',
 'yorkshire\n',
 'ARGENTINA',
 'Konoha',
 'Michigan, USA',
 'Rocky Mountains',
 'El Dorado, Arkansas',
 'Ondo',
 'GLOBAL/WORLDWIDE',
 'Largo, MD',
 'China',
 'Orm',
 'on to the next adventure',
 'rzl ?',
 'W.I.T.S Academy',
 'Torrance, CA',
 'Kalimantan Timur, Indonesia',
 'Broomfield, CO',
 'Caracas, Venezuela.',
 'Reading UK',
 'See the barn of bleakness',
 'Ely, Cambridgeshire',
 'Oklahoma, USA',
 'New Hampshire',
 'Birmingham, UK',
 '#keepthefaith J&J',
 'Buscame EL tu Melte',


222 unique `keyword` values (including `nan`) out of 7522 non-nan values.<br>
One-hot encode it.<br><br>
On the other hand, 3342 unique `location` values out of 5080 non-nan values.<br>
Seems correct location names and meaningless location names are mixed, but not sure how to separate them.<br>
For now, just make it a column of one-zero value.

In [9]:
# Replace 'nan' with 'none'
def replaceNAN_none(df, column='keyword'):
    col_edit = df[column].values.copy()
    col_edit[pd.isnull(col_edit)] = 'none'
    col_edit = col_edit.reshape(-1, 1)
    return col_edit

train_keyword_edit = replaceNAN_none(train)
test_keyword_edit = replaceNAN_none(test)

# Set up an one-hot encoder.
enc = OneHotEncoder(handle_unknown='ignore', dtype=np.int)  # Ignore categories not present in the training data.
enc.fit(train_keyword_edit)

def replaceCol_onehot(df, column='keyword', encoder=enc):
    col_edit = replaceNAN_none(df, column)
    cols_onehot = encoder.transform(col_edit).toarray()
    
    df_new = df.drop(column, axis=1)
    df_new = pd.concat([df_new, pd.DataFrame(cols_onehot)], axis=1)
    
    return df_new

    
train = replaceCol_onehot(train)
test = replaceCol_onehot(test)

In [10]:
len(set(train.columns.values))


226

In [11]:
# Replace 'location' column and drop the original with 'id' column.
train['location_isnull'] = pd.isnull(train.location).astype('int')
test['location_isnull'] = pd.isnull(test.location).astype('int')
train.drop(['id', 'location'], axis=1, inplace=True)
test.drop(['id', 'location'], axis=1, inplace=True)

In [12]:
train.text

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [13]:
# Make bag of words from tweet texts
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def tweet_to_words(text):
    nltk.download("stopwords", quiet=True)
    stemmer = PorterStemmer()
    
    text = BeautifulSoup(text, "html.parser").get_text() # Remove HTML tags.
    text = re.sub(r"[,\.]", "", text)  # Remove ',' and '.'.
    text = re.sub(r"(\w?)\d+(\w?)", "\\1 thisissomenumber \\2", text)  # Replace all number words with a token text.
    text = re.sub(r"\d+", "thisissomenumber", text)  # Replace all number words with a token text.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    words = [w for w in words if w not in stopwords.words("english")] # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words] # stem
    words = ' '.join(words)  # Make it back into a sentence
    
    return words

In [14]:
tweet_to_words(train.text[0])

'deed reason earthquak may allah forgiv us'

In [15]:
words_train = train.text.apply(tweet_to_words)
words_test = test.text.apply(tweet_to_words)

In [16]:
words_train[0:10]

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3    thisissomenumb thisissomenumb peopl receiv wil...
4    got sent photo rubi alaska smoke wildfir pour ...
5    rockyfir updat california hwi thisissomenumb t...
6    flood disast heavi rain caus flash flood stree...
7                               top hill see fire wood
8               emerg evacu happen build across street
9                             afraid tornado come area
Name: text, dtype: object

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.externals import joblib
cache_dir = 'cache/'
cache_file = 'bagOfWords.pkl'

if not os.path.exists('cache'):
    subprocess.check_call('mkdir cache', shell=True)

def extract_BoW_features(
    words_train, words_test, 
    vocabulary_size=5000,
    cache_dir=cache_dir, cache_file=cache_file
):
    """Extract Bag-of-Words for a given set of documents, already preprocessed into words."""
    
    # If cache_file is not None, try to read from it first
    cache_data = None
    if cache_file is not None:
        try:
            with open(os.path.join(cache_dir, cache_file), "rb") as f:
                cache_data = joblib.load(f)
            print("Read features from cache file:", cache_file)
        except:
            pass  # unable to read from cache, but that's okay
    
    # If cache is missing, then do the heavy lifting
    if cache_data is None:
        print("Make bag of words from scratch.")
        # Fit a vectorizer to training documents and use it to transform them
        # NOTE: Training documents have already been preprocessed and tokenized into words;
        #       pass in dummy functions to skip those steps, e.g. preprocessor=lambda x: x
        vectorizer = CountVectorizer(max_features=vocabulary_size)
        features_train = vectorizer.fit_transform(words_train).toarray()

        # Apply the same vectorizer to transform the test documents (ignore unknown words)
        features_test = vectorizer.transform(words_test).toarray()
        
        # NOTE: Remember to convert the features using .toarray() for a compact representation
        
        # Write to cache file for future runs (store vocabulary as well)
        if cache_file is not None:
            vocabulary = vectorizer.vocabulary_
            cache_data = dict(features_train=features_train, features_test=features_test,
                             vocabulary=vocabulary)
            with open(os.path.join(cache_dir, cache_file), "wb") as f:
                joblib.dump(cache_data, f)
            print("Wrote features to cache file:", cache_file)
    else:
        # Unpack data loaded from cache file
        print("Load bag of words from cache.")
        features_train, features_test, vocabulary = (cache_data['features_train'],
                cache_data['features_test'], cache_data['vocabulary'])
    
    # Return both the extracted features as well as the vocabulary
    return features_train, features_test, vocabulary






In [18]:
bow_train, bow_test, vocabulary = extract_BoW_features(words_train, words_test)

Read features from cache file: bagOfWords.pkl
Load bag of words from cache.


In [19]:
train = pd.concat([train, pd.DataFrame(bow_train)], axis=1)
test = pd.concat([test, pd.DataFrame(bow_test)], axis=1)
train.drop('text', axis=1, inplace=True)
test.drop('text', axis=1, inplace=True)
print(train.head())
print(test.head())

   target  0  1  2  3  4  5  6  7  8  ...  4990  4991  4992  4993  4994  4995  \
0       1  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       1  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       1  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       1  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       1  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   

   4996  4997  4998  4999  
0     0     0     0     0  
1     0     0     0     0  
2     0     0     0     0  
3     0     0     0     0  
4     0     0     0     0  

[5 rows x 5224 columns]
   0     1     2     3     4     5     6     7     8     9     ...  4990  \
0     0     0     0     0     0     0     0     0     0     0  ...     0   
1     0     0     0     0     0     0     0     0     0     0  ...     0   
2     0     0     0     0     0     0     0     0     0     0  ...     0   
3     0     0   

In [20]:
# Make the target values the first column.
train_y = train['target']
train_x = train.drop('target', axis=1)
train = pd.concat([train_y, train_x], axis=1)
train.head()

Unnamed: 0,target,0,1,2,3,4,5,6,7,8,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
if not os.path.exists('processed_data'):
    subprocess.check_call('mkdir processed_data', shell=True)

train.to_csv("processed_data/train_processed.csv", header=False, index=False)
test.to_csv("processed_data/test_processed.csv", header=False, index=False)

In [22]:
print(
    "After preprocessing\n\
    train shape: {}\n\
    test shape: {}".format(
        train.shape,
        test.shape
    )
)

After preprocessing
    train shape: (7613, 5224)
    test shape: (3263, 5223)
