In [146]:
import numpy as np 
import pandas as pd

from pathlib import Path
WORK_DIR = Path.cwd()
DATA_DIR = Path.cwd()/'data'

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

from xgboost import XGBClassifier

from copy import deepcopy

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer

import warnings
warnings.filterwarnings("ignore")

## Loading Cleaned Data

In [2]:
train = pd.read_pickle(DATA_DIR/'interim/train.pkl')
test = pd.read_pickle(DATA_DIR/'interim/test.pkl')

In [11]:
array = np.array(train)
np.delete(array, [1,2,3,4,5], 1).shape

(7613, 3)

In [79]:
def generate_text_features_col(X, selected_col = 'text'):
    df = X.copy()
    df['length'] = train[selected_col].apply(lambda x: len(x))
    df['word counts'] = train[selected_col].apply(lambda x: len(x.split(' ')))
    df['capital'] = train[selected_col].apply(lambda x: sum(map(str.isupper, x)))
    df['num_hashtags'] = train['hashtags'].apply(lambda x: len(x))
    df['num_tags'] = train[selected_col].apply(lambda x: x.count('@'))

    return df

features_transformer = FunctionTransformer(generate_text_features_col, kw_args={"selected_col": 'text'})
features_transformer.fit_transform(train)
features_transformer.transform(train).head()

Unnamed: 0,id,keyword,location,text,target,keyword_processed,text_processed,hashtags,length,word counts,capital,num_hashtags,num_tags
0,1,missing,missing,Our Deeds are the Reason of this #earthquake M...,1,miss,deed reason earthquake allah forgive,earthquake,69,13,10,10,0
1,4,missing,missing,Forest fire near La Ronge Sask. Canada,1,miss,forest fire near la ronge sask . canada,missing,38,7,5,7,0
2,5,missing,missing,All residents asked to 'shelter in place' are ...,1,miss,resident ask ' shelter place ' notify officer ...,missing,133,22,2,7,0
3,6,missing,missing,"13,000 people receive #wildfires evacuation or...",1,miss,"13,000 people receive wildfire evacuation orde...",wildfires,65,9,1,9,0
4,7,missing,missing,Just got sent this photo from Ruby #Alaska as ...,1,miss,got send photo ruby alaska smoke wildfire pour...,Alaska wildfires,88,17,3,16,0


In [91]:
class OHE_Transformer(BaseEstimator, TransformerMixin): 
    def __init__(self, categories=None, selected_col='keyword_processed', to_array=False):
        self.categories = categories
        self.col = selected_col
        self.to_array = to_array
        
    def fit(self, X, y=None):
        self.categories = X.loc[:, self.col].unique()
        return self

    def transform(self, X):
        ohe = pd.get_dummies(X[self.col],drop_first=True)
        ohe = ohe.T.reindex(self.categories).T.fillna(0)
        ohe.columns = ['kw_'+col for col in ohe.columns]
        df = pd.concat([X, ohe], axis=1)

        if self.to_array: 
            df = np.array(df)

        return df

ohe_kw = OHE_Transformer()
ohe_kw.fit_transform(train)
test_transform = ohe_kw.transform(test)

In [96]:
def select_features(X, selected_cols = None, to_array=False):
    df = X.copy()
    if selected_cols is None: 
        selected_cols = df.columns
        
    initial_cols = ['id', 'keyword', 'location', 'text', 'target', 'keyword_processed',
                    'text_processed', 'hashtags']
    select_cols = [col for col in selected_cols if col not in initial_cols]
    df = df[select_cols]
    
    if to_array:
        df = np.array(df)

    return df

feature_selector = FunctionTransformer(select_features, kw_args={"to_array": False})
feature_selector.fit_transform(train)
feature_selector.transform(test_transform)

Unnamed: 0,kw_miss,kw_ablaze,kw_accident,kw_aftershock,kw_airplaneaccident,kw_ambulance,kw_annihilate,kw_annihilation,kw_apocalypse,kw_armageddon,...,kw_violentstorm,kw_volcano,kw_warzone,kw_weapon,kw_whirlwind,kw_wildfire,kw_windstorm,kw_wound,kw_wreck,kw_wreckage
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Building Pipeline

In [142]:
pipe = Pipeline([('extra_features', FunctionTransformer(generate_text_features_col, kw_args={"selected_col": 'text'})),
                 ('ohe', OHE_Transformer()), 
                 ('feature_selector', FunctionTransformer(select_features, kw_args={"to_array": False})), 
                 ('standard_scaler', StandardScaler())])

In [137]:
pipe.fit_transform(X_train)

array([[ 1.15259136e+00,  5.29599390e-01, -9.12648131e-02, ...,
        -6.54797598e-02, -6.29005464e-02, -6.29005464e-02],
       [-9.07653300e-02,  1.12632430e-02, -4.69877014e-01, ...,
        -6.54797598e-02, -6.29005464e-02, -6.29005464e-02],
       [ 1.24140255e+00,  1.22071425e+00,  8.55265690e-01, ...,
        -6.54797598e-02, -6.29005464e-02, -6.29005464e-02],
       ...,
       [ 5.90120476e-01, -3.34294188e-01,  2.93763280e+00, ...,
        -6.54797598e-02, -6.29005464e-02, -6.29005464e-02],
       [ 1.03417644e+00,  8.75156822e-01, -2.80570914e-01, ...,
        -6.54797598e-02, -6.29005464e-02, -6.29005464e-02],
       [-1.95413789e-03,  3.56820674e-01, -2.80570914e-01, ...,
        -6.54797598e-02, -6.29005464e-02, -6.29005464e-02]])

## Model Training

In [132]:
X_train, X_test, y_train, y_test = train_test_split(train.drop('target', axis=1), train['target'], test_size=0.2, random_state=42)

In [143]:
lr_pipe = deepcopy(pipe)
lr_pipe.steps.append(['logistic',  LogisticRegression(penalty='l1', solver='liblinear')])
lr_pipe.fit(X_train, y_train)
lr_pipe.score(X_test, y_test)

0.7275114904793172

In [147]:
xg_pipe = deepcopy(pipe)
xg_pipe.steps.append(['xgboost',  XGBClassifier(objective='binary:logistic', eval_metric ='error')])
xg_pipe.fit(X_train, y_train)
xg_pipe.score(X_test, y_test)

0.7294812869336835

## Model Evaluation

In [None]:

confusion_matrix(y_test, xg_pipe.predict(X_test), normalize='true')