In [1]:
import numpy as np 
import pandas as pd

from pathlib import Path
WORK_DIR = Path.cwd()
DATA_DIR = Path.cwd()/'data'
OUT_DIR = Path.cwd()/'output'

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold, train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier

from copy import deepcopy

import spacy
import nltk
from nltk.stem.snowball import SnowballStemmer

import warnings
warnings.filterwarnings("ignore")

## Loading Cleaned Data

In [2]:
train = pd.read_pickle(DATA_DIR/'interim/train.pkl').reset_index(drop=True)
test = pd.read_pickle(DATA_DIR/'interim/test.pkl').reset_index(drop=True)

In [3]:
array = np.array(train)
np.delete(array, [1,2,3,4,5], 1).shape

(7613, 3)

In [4]:
def generate_text_features_col(X, selected_col = 'text'):
    df = X.copy()
    df['length'] = train[selected_col].apply(lambda x: len(x))
    df['word counts'] = train[selected_col].apply(lambda x: len(x.split(' ')))
    df['capital'] = train[selected_col].apply(lambda x: sum(map(str.isupper, x)))
    df['num_hashtags'] = train['hashtags'].apply(lambda x: len(x))
    df['num_tags'] = train[selected_col].apply(lambda x: x.count('@'))

    return df

features_transformer = FunctionTransformer(generate_text_features_col, kw_args={"selected_col": 'text'})
features_transformer.fit_transform(train)
features_transformer.transform(train).head()

Unnamed: 0,id,keyword,location,text,target,keyword_processed,text_processed,hashtags,length,word counts,capital,num_hashtags,num_tags
0,1,missing,missing,our deeds are the reason of this #earthquake m...,1,miss,deed reason earthquake allah forgive,earthquake,69,13,0,10,0
1,4,missing,missing,forest fire near la ronge sask. canada,1,miss,forest fire near la ronge sask canada,missing,38,7,0,7,0
2,5,missing,missing,all residents asked to 'shelter in place' are ...,1,miss,resident ask shelter place notify officer evac...,missing,133,22,0,7,0
3,6,missing,missing,"13,000 people receive #wildfires evacuation or...",1,miss,"13,000 people receive wildfire evacuation orde...",wildfires,65,9,0,9,0
4,7,missing,missing,just got sent this photo from ruby #alaska as ...,1,miss,got send photo ruby alaska smoke wildfire pour...,alaska wildfires,88,17,0,16,0


In [5]:
class OHE_Transformer(BaseEstimator, TransformerMixin): 
    def __init__(self, categories=None, col='keyword_processed', to_array=False):
        self.categories = categories
        self.col = col
        self.to_array = to_array
        
    def fit(self, X, y=None):
        self.categories = X.loc[:, self.col].unique()
        return self

    def transform(self, X):
        ohe = pd.get_dummies(X[self.col],drop_first=True)
        ohe = ohe.T.reindex(self.categories).T.fillna(0)
        ohe.columns = ['kw_'+col for col in ohe.columns]
        df = pd.concat([X, ohe], axis=1)

        if self.to_array: 
            df = np.array(df)

        return df

ohe_kw = OHE_Transformer()
ohe_kw.fit_transform(train)
test_transform = ohe_kw.transform(test)

In [6]:
class Text_Vectorizer(BaseEstimator, TransformerMixin): 
    def __init__(self, col = 'text_processed', tfidf = False, **kwargs):
        self.col = col
        self.tfidf = tfidf
        
        if self.tfidf:
            self.vectorizer = TfidfVectorizer(**kwargs)
        else:
            self.vectorizer = CountVectorizer(**kwargs)
                
    def fit(self, X, y=None):
        self.vectorizer.fit(X[self.col])
        self.vocab = self.vectorizer.get_feature_names()
        return self

    def transform(self, X):
        if self.tfidf:
            vectorizer = TfidfVectorizer(vocabulary = self.vocab)
        else:
            vectorizer = CountVectorizer(vocabulary = self.vocab)
        words = vectorizer.fit_transform(X[self.col]).toarray()
        words_df = pd.DataFrame(words, 
                                columns=['body_'+col for col in vectorizer.get_feature_names()],
                                index=X.index)

        merged = pd.concat([X, words_df], axis=1)
        return merged


In [7]:
def select_features(X, selected_cols = None, to_array=False):
    df = X.copy()
    if selected_cols is None: 
        selected_cols = df.columns
        
    initial_cols = ['id', 'keyword', 'location', 'text', 'target', 'keyword_processed',
                    'text_processed', 'hashtags']
    select_cols = [col for col in selected_cols if col not in initial_cols]
    df = df[select_cols]
    
    if to_array:
        df = np.array(df)

    return df

feature_selector = FunctionTransformer(select_features, kw_args={"to_array": False})
feature_selector.fit_transform(train)
feature_selector.transform(test_transform)

Unnamed: 0,kw_miss,kw_ablaze,kw_accident,kw_aftershock,kw_airplaneaccident,kw_ambulance,kw_annihilate,kw_annihilation,kw_apocalypse,kw_armageddon,...,kw_violentstorm,kw_volcano,kw_warzone,kw_weapon,kw_whirlwind,kw_wildfire,kw_windstorm,kw_wound,kw_wreck,kw_wreckage
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3258,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3259,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3260,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3261,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
class WordVector(BaseEstimator, TransformerMixin): 
    def __init__(self, col = 'text', spacy_ver = None, num_components=None):
        self.col = col
        self.spacy = spacy_ver
        self.pca = None 
        self.num_components = num_components 

        if self.spacy is None:
            nlp = spacy.load('en_core_web_sm')
            self.spacy = nlp
                
    def fit(self, X, y=None):
        self.pca = None
        return self

    def transform(self, X):
        text_vector = X[self.col].apply(lambda x: self.spacy(x).vector)
        array = np.array([list(arr) for arr in list(text_vector)])
        
        if self.num_components is not None: 
            if self.pca is not None: 
                pca = self.pca 
                array = pca.transform(array)

            else:
                pca = PCA(n_components=self.num_components)
                array = pca.fit_transform(array)

            self.pca = pca
            self.explained_variance_ratio_ = pca.explained_variance_ratio_
        comp_df = pd.DataFrame(array, columns = [f'comp_{i+1}' for i in range(array.shape[1])])
        merged = pd.concat([X, comp_df], axis=1)
        return merged
    
    
class WordVector_Small(BaseEstimator, TransformerMixin): 
    def __init__(self, selected_columns = 'text_processed', spacy_ver = None):
        self.col = selected_columns
        self.spacy = spacy_ver
        self.pca = None 

        if self.spacy is None:
            nlp = spacy.load('en_core_web_sm')
            self.spacy = nlp
                
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        text_vector = X['text_processed'].apply(lambda x: self.spacy(x).vector)
        array = np.array([list(arr) for arr in list(text_vector)])
        comp_df = pd.DataFrame(array, columns = [f'comp_{i+1}' for i in range(array.shape[1])])
        merged = pd.concat([X, comp_df], axis=1)
        return merged



In [9]:
nlp = spacy.load('en_core_web_sm')
word_vec = WordVector(spacy_ver = nlp, num_components=50)

train_vec = word_vec.fit_transform(train)
test_vec = word_vec.transform(test)

In [10]:
train_vec.to_pickle(DATA_DIR/'interim/train_word_vector.pkl')
test_vec.to_pickle(DATA_DIR/'interim/test_word_vector.pkl')

#### Choosing Only Top Vocabs

In [11]:
def calc_prevalence(df): 
    features = df.drop('target', axis=1)
    active = np.sum(np.array(features) * train.target.to_numpy().reshape(-1,1), axis=0)
    word_counts = np.sum(np.array(features), axis=0)
    series = pd.Series(active/word_counts, index=features.columns)
    return series.sort_values(ascending=False)


def export_high_prevalence(series, threshold):
    return list(prevalence.index[(prevalence>(1-threshold))]) + list(prevalence.index[(prevalence<threshold)])


In [12]:
count = CountVectorizer(ngram_range=(1,1), min_df=30)
vectorized = count.fit_transform(train['text_processed'])
vector = pd.DataFrame(np.c_[vectorized.toarray(), np.array(train.target)], columns = count.get_feature_names()+['target'])

prevalence = calc_prevalence(vector)
vocab_1 = export_high_prevalence(prevalence, 0.25)

In [13]:
count = CountVectorizer(ngram_range=(2,2), min_df=15)
vectorized = count.fit_transform(train['text_processed'])
vector = pd.DataFrame(np.c_[vectorized.toarray(), np.array(train.target)], columns = count.get_feature_names()+['target'])

prevalence = calc_prevalence(vector)
vocab_2 = export_high_prevalence(prevalence, 0.25)

In [14]:
len(vocab_1)

177

#### Building Pipeline

In [38]:
vec = Text_Vectorizer(tfidf=False, min_df=50,  ngram_range=(1,2))
vec.fit_transform(X_train)

Unnamed: 0,id,keyword,location,text,keyword_processed,text_processed,hashtags,comp_1,comp_2,comp_3,...,body_way,body_weapon,body_wildfire,body_woman,body_work,body_world,body_wound,body_wreck,body_year,body_youtube
6445,9220,suicide bombing,USA,turkish troops killed in kurdish militant 'sui...,suicidebombing,turkish troop kill kurdish militant suicide at...,missing,-1.163644,0.863250,-0.026014,...,0,0,0,0,0,0,0,0,0,0
3870,5502,flames,Fairy Tail!,@aisumage @akumareisu --just between gray and ...,flame,@aisumage @akumareisu --just gray ophelia red ...,missing,-0.776185,-0.118698,-0.562627,...,0,0,0,0,0,0,0,0,0,0
2431,3493,derailed,United Kingdom,#tubestrike derailed you? our #robertwelch cut...,derail,tubestrike derail robertwelch cutlery offer tr...,robertwelch tubestrike,-0.038096,-0.487312,0.670012,...,0,0,0,0,0,0,0,0,0,0
5355,7643,pandemonium,California,truly a scene of chaos unprecedented in frenzy...,pandemonium,truly scene chaos unprecedented frenzy pandemo...,missing,0.519658,-0.020307,-0.140782,...,0,0,0,0,0,0,0,0,0,0
6496,9288,sunk,missing,everything has sunk in except the fact that i ...,sink,sink fact actually move state colorado tomorro...,missing,0.938073,-0.632953,-0.397866,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4060,5769,forest fires,missing,#heartdisease u.s. forest service says spendin...,forestfire,heartdisease u.s forest service say spend half...,heartdisease,-0.438908,0.643492,-0.400172,...,0,0,0,0,0,0,0,0,0,0
1346,1945,burning buildings,"Seattle, WA",i will never support looting or the burning of...,burnbuilding,support looting burning building see people fi...,missing,0.029256,-1.224668,-0.205016,...,0,0,0,0,0,0,0,0,0,0
3454,4940,exploded,"Oakland, Ca",holy crap @kingmyth1999 my phone just exploded...,explode,holy crap @kingmyth1999 phone explode haha,missing,0.563763,-0.216649,0.980248,...,0,0,0,0,0,0,0,0,0,0
7533,10771,wreckage,Mumbai,wreckage 'conclusively confirmed' as from mh37...,wreckage,wreckage conclusively confirm mh370 malaysia pm,missing,0.332373,-0.087433,-0.313077,...,0,0,0,0,0,0,0,0,0,0


In [39]:
pipe = Pipeline([
                 ('extra_features', FunctionTransformer(generate_text_features_col, kw_args={"selected_col": 'text'})),
                 ('ohe', OHE_Transformer()), 
                 #('vectorize1', Text_Vectorizer(sublinear_tf=True, tfidf=True, max_features=60000, min_df=1, norm='l2',  ngram_range=(1,2))),
                 ('vectorize2', Text_Vectorizer(tfidf=False, min_df=10,  ngram_range=(1,2))),
                 ('feature_selector', FunctionTransformer(select_features, kw_args={"to_array": False})), 
                 #('standard_scaler', StandardScaler()), 
                 #('pca', PCA(n_components=1000))
                  ])

In [36]:
shortpipe= deepcopy(pipe)
shortpipe.steps.pop(-1)
shortpipe.steps.pop(-1)
transform_train = shortpipe.fit_transform(X_train)
transform_test = shortpipe.transform(test)

## Model Training

In [43]:
lr_pipe = deepcopy(pipe)
lr_pipe.steps.append(['logistic',  LogisticRegression(C=3, penalty='l1', solver='liblinear')])
lr_pipe.fit(X_train, y_train)
lr_pipe.score(X_test, y_test)

0.7984241628365069

In [42]:
cv = KFold(n_splits=4, random_state=123)
k_fold = cross_val_score(lr_pipe, X_test, y_test, cv=cv)
k_fold.mean()

0.7491797900262467

In [23]:
lr_pipe.score(X_train, y_train)

0.8848932676518884

In [55]:
params = {'alpha': 1,
 'gamma': 1,
 'learning_rate': 0.4,
 'max_depth': 15,
 'min_child_weight': 1}

In [56]:
xg_pipe = deepcopy(pipe)
xg_pipe.steps.append(['xgboost',  XGBClassifier(objective='binary:logistic', eval_metric ='error', **params)])
xg_pipe.fit(X_train, y_train)
xg_pipe.score(X_test, y_test)

0.7248850952068286

In [53]:
cv = KFold(n_splits=4, random_state=123)
k_fold = cross_val_score(xg_pipe, X_test, y_test, cv=cv)
k_fold.mean()

0.5771498135101533

In [54]:
xg_pipe.score(X_train, y_train)

0.5686371100164204

In [None]:
xg_pipe = Pipeline([('extra_features', FunctionTransformer(generate_text_features_col, kw_args={"selected_col": 'text'})),
                     ('ohe', OHE_Transformer()), 
                     ('vectorize1', Text_Vectorizer(min_df=40, tfidf = False, ngram_range=(1,1))),
                     ('vectorize2', Text_Vectorizer(min_df=20, tfidf = False, ngram_range=(2,2))),
                     ('feature_selector', FunctionTransformer(select_features, kw_args={"to_array": False})), 
                     ('standard_scaler', StandardScaler())])


X_train_xg = xg_pipe.fit_transform(X_train)
X_test_xg = xg_pipe.transform(X_test)

In [None]:
xg_params =  {"learning_rate"    : [0.05, 0.15, 0.30 ],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 2, 5],
        'max_depth': [3, 6, 12], 
        'alpha': [1, 10, 100]
        }

In [None]:
#xg_pipe.steps.append(['xgboost',  XGBClassifier(objective='binary:logistic', eval_metric ='error')])
search = GridSearchCV(XGBClassifier(objective='binary:logistic', eval_metric ='error'), xg_params, cv=3, n_jobs=6, verbose=3, refit=True)
search.fit(X_train_xg, y_train)

In [None]:
pd.DataFrame(zip(test_vec.id, lr_pipe.predict(test_vec)), columns=['id', 'target']).set_index('id').to_csv(OUT_DIR/'logistic_3.csv')

In [None]:
pd.DataFrame(zip(test_vec.id, xg_pipe.predict(test_vec)), columns=['id', 'target']).set_index('id').to_csv(OUT_DIR/'xgboost_3.csv')

#### Gridsearch

In [None]:
param_grid = {
    'pca__n_components': [5, 15, 30, 45, 64],
    'logistic__C': np.logspace(-4, 4, 4),
}

## Model Evaluation

In [None]:
confusion_matrix(y_test, lr_pipe.predict(X_test), normalize='true')

#### Shap Values

In [None]:
shortpipe= deepcopy(pipe)
shortpipe.steps.pop(-1)

In [None]:
import shap

explainer = shap.Explainer(xg_pipe.steps[-1][-1])
shap_values = explainer(shortpipe.fit_transform(X_train))

In [None]:
shap.plots.bar(shap_values)