## Build Training set for doc topic

In [1]:
import TWB
import glob

In [2]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
#from nltk.stem import LancasterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
porter = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sparaschiakos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
english_document_data = pd.read_pickle('data/english_document_data.pkl')
stems = pd.read_pickle('data/stemmed_data.pkl')
doc_topic = pd.read_pickle('data/english_document_data_topics.pkl')


In [15]:
doc_topic.columns

Index(['health', 'logistics', 'nutrition', 'protection', 'shelter',
       'early recovery', 'education', 'emergency telecommunications',
       'food security', 'water/sanitation/hygiene',
       'camp coordination/camp management'],
      dtype='object')

In [16]:
doc_topic

Unnamed: 0,health,logistics,nutrition,protection,shelter,early recovery,education,emergency telecommunications,food security,water/sanitation/hygiene,camp coordination/camp management
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False


In [17]:
print(len(doc_topic))
print(len(stems))
print(len(english_document_data))

30073
34525
30073


In [18]:
# keep only the english ones
stems_en = stems.iloc[english_document_data.index]

In [19]:
# drop dublicates and create the corpus
stems_en_noDbl = stems_en.drop_duplicates(subset='Content')
len(stems_en_noDbl.index)

6730

In [20]:
# drop duplicates of english_document_data based on content 
doc_topic_noDbl = doc_topic.loc[stems_en_noDbl.index]

In [21]:
# keep df with true in labels
df = doc_topic_noDbl.loc[
    (doc_topic_noDbl['health']==True) |
    (doc_topic_noDbl['logistics']==True) |
    (doc_topic_noDbl['nutrition']==True) |
    (doc_topic_noDbl['protection']==True) |
    (doc_topic_noDbl['shelter']==True) |
    (doc_topic_noDbl['early recovery']==True) |
    (doc_topic_noDbl['education']==True) |
    (doc_topic_noDbl['emergency telecommunications']==True) |
    (doc_topic_noDbl['food security']==True) |
    (doc_topic_noDbl['water/sanitation/hygiene']==True) |
    (doc_topic_noDbl['camp coordination/camp management']==True)]

In [26]:
labeled_docs = pd.DataFrame(index= df.index, columns=['label'])

In [27]:
# keep docs with label
for i in df.index:
    go = False
    for j in df.columns:
        if (df[j][i] == True) & (go == False):
#             labeled_docs.document[i] = df.Filename[i]
            labeled_docs.label[i] = j
            go = True

In [28]:
labeled_docs

Unnamed: 0,label
350,water/sanitation/hygiene
617,nutrition
781,health
1341,health
1343,education
1498,nutrition
1503,protection
1505,protection
1515,protection
1545,protection


## Compute the relative freqs per word in the corpus per document

In [29]:
# create the list with all the contents
texts =[]
texts = list(stems_en_noDbl.Content)
token_freq = [ TWB.common.freq(t.split(' ')) for t in texts ]
token_freq[0]

{'': 1, 'test': 87, 'thi': 86}

In [30]:
dictionary = list(set(' '.join(texts).split(' ')))
dict_idx = { w : i for (i,w) in enumerate(dictionary) }
# dict_idx

In [32]:
import numpy as np
M = np.zeros((len(texts),len(dictionary)))
for i, text in enumerate(texts):
    M[i,[dict_idx[w] for w in token_freq[i].keys()]] = np.array(list(token_freq[i].values()))/sum(token_freq[i].values())

In [33]:
# the relative freqs per word per document
labeled_set= pd.DataFrame(M, columns=dictionary,index=stems_en_noDbl.index)

In [34]:
labeled_set.head()

Unnamed: 0,Unnamed: 1,naděj,ehoeconc301,symposia,harithi,arlen,5746cb69b3cbf7ea3385dec6,unflatt,1821,teaspon,...,ilmi,7204060009,kolela,peard,jdl,f100o,dashboard,backtitl,kimani,6998
0,0.005747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.005155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.00565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.007042,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,3.4e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# keep cols with only digits
digit_stems = []
for col in labeled_set.columns:
    if col.isdigit():
        digit_stems.append(col)  
len(digit_stems)

10120

In [36]:
# compute and keep the amount of digit stems per doc
labeled_set['__digits'] = labeled_set[digit_stems].apply(sum,axis=1)

In [37]:
# drop digit columns
labeled_set = labeled_set.drop(columns=digit_stems)

In [38]:
# create a column for the labels for all the docs and impute as unknown
labeled_set["label"] = 'Unknown'

In [39]:
# impute the knwon labels
for i in labeled_docs.index:
        labeled_set['label'].loc[i] = labeled_docs["label"].loc[i]

In [40]:
labeled_docs['label'].unique()

array(['water/sanitation/hygiene', 'nutrition', 'health', 'education',
       'protection', 'shelter', 'logistics'], dtype=object)

In [41]:
# save the labeled set of docs 
labeled_set.to_pickle('data/labeled_set_topic.pkl')

In [50]:
count = labeled_set['label'].value_counts()
count

Unknown                     6439
health                       111
education                     64
nutrition                     44
protection                    37
water/sanitation/hygiene      17
shelter                       13
logistics                      5
Name: label, dtype: int64

In [52]:
labeled_set.loc[labeled_docs.index]

Unnamed: 0,Unnamed: 1,naděj,ehoeconc301,symposia,harithi,arlen,5746cb69b3cbf7ea3385dec6,unflatt,teaspon,arantza,...,televisión,ilmi,kolela,peard,jdl,f100o,dashboard,backtitl,kimani,__digits
350,0.002053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012320
617,0.000436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042683
781,0.010000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010000
1341,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1343,0.000183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027473
1498,0.000224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027136
1503,0.001091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009815
1505,0.000936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024345
1515,0.001299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001299
1545,0.002825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


## PCA 

In [53]:
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd

In [55]:
# keep only features
features_doc_topics = labeled_set.drop(columns=['label'])

In [56]:
# standarize
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(features_doc_topics)
# Apply transform to both the training set and the test set.
train_img = scaler.transform(features_doc_topics)

In [57]:
from sklearn.decomposition import PCA
pca = PCA(.95)

In [58]:
pca.fit(train_img)

PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [90]:
import joblib
print(pca.n_components_)

joblib.dump(pca,'docClassif/pca_topic.joblib')

3685


['docClassif/pca_topic.joblib']

## Training Classifiers

In [60]:
from pandas.tools.plotting import parallel_coordinates
import pandas as pd
from sklearn import svm

from sklearn.metrics import jaccard_similarity_score

#from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

random_state = 100
#import xgboost as xgb

import pickle
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
from sklearn.model_selection import cross_val_score

In [61]:
# keep all the docs with label (drop unknown)
features_to_class = features_doc_types.loc[labeled_docs.index]
test_to_class = labeled_set.label.loc[labeled_docs.index]

In [96]:
pd.DataFrame(features_to_class.columns).to_pickle('data/feature_names.pkl')

In [62]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(features_to_class, test_to_class, test_size=0.3)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)


(203, 107251) (203,)
(88, 107251) (88,)


In [82]:
pd.DataFrame(Y_train).to_pickle('Y_train_docTopic.pkl')
pd.DataFrame(Y_test).to_pickle('Y_test_docTopic.pkl')

In [64]:
# transform features from PCA
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [65]:
pd.DataFrame(X_train).to_pickle('X_train_docTopic.pkl')
pd.DataFrame(X_test).to_pickle('X_test_docTopic.pkl')

### Random Forest

In [66]:
# Hyperparameter Optimization for random forest
from scipy.stats import randint as sp_randint

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, make_scorer
from time import time
# initialize the classifier
clf_RF = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=100)

In [73]:
# find the number of features for parameter optimization
num_features = X_train.shape[1]

# make scorer#
#auc_scorer = make_scorer(roc_auc_score, needs_threshold=True)#, needs_proba=True

# params
m_depths = [None] + list(range(2, 21, 3))
# specify parameters and distributions to sample from
param_dist_rf = {"max_depth": m_depths, # [None, 2, 5, 8, 11, 14, 17, 20]
                 "max_features": sp_randint(1, num_features),
                 "min_samples_split": sp_randint(2, 100),
                 "min_samples_leaf": sp_randint(1, 100),
                 "bootstrap": [True, False],
                 "criterion": ["gini", "entropy"]}

n_iter_search = 30 # was 20

# run randomized search
random_search_RF = RandomizedSearchCV(clf_RF, 
                                      param_distributions=param_dist_rf,
                                      n_iter=n_iter_search,
                                      random_state=random_state,
                                      n_jobs=-1,
                                      return_train_score=True)

start = time()
random_search_RF.fit(X_train, Y_train)
print("RandomizedSearchCV took {:.2f} seconds for {} candidates"
      " parameter settings.\n".format((time() - start), n_iter_search))

# display the dataframe without the parameters column for better rendering
display(pd.DataFrame(random_search_RF.cv_results_).drop('params', axis=1))

best_parameters_RF = pd.Series(random_search_RF.best_params_)

# display the best set of parameters
display(best_parameters_RF)

RandomizedSearchCV took 11.84 seconds for 30 candidates parameter settings.



Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,...,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.1331,0.104956,0.374384,0.374438,True,gini,8.0,80,49,12,...,0.366197,0.378788,0.378788,0.372263,0.378788,0.372263,0.003623,0.001427,0.006004,0.003076
1,0.122137,0.104918,0.374384,0.374438,True,gini,5.0,54,67,16,...,0.366197,0.378788,0.378788,0.372263,0.378788,0.372263,0.002554,0.000839,0.006004,0.003076
2,0.12364,0.103884,0.53202,0.56138,True,entropy,,281,16,62,...,0.521127,0.545455,0.560606,0.562044,0.515152,0.576642,0.004658,0.000376,0.019992,0.012741
3,0.131191,0.10341,0.374384,0.374438,True,gini,8.0,1674,94,88,...,0.366197,0.378788,0.378788,0.372263,0.378788,0.372263,0.007734,0.001383,0.006004,0.003076
4,0.240348,0.105154,0.522167,0.554173,True,entropy,11.0,1925,32,3,...,0.549296,0.55303,0.515152,0.540146,0.5,0.569343,0.013283,0.002888,0.020813,0.011947
5,0.311704,0.108487,0.512315,0.549215,False,entropy,8.0,613,5,93,...,0.507042,0.545455,0.530303,0.540146,0.5,0.562044,0.022341,0.003796,0.012815,0.009327
6,0.333884,0.110163,0.487685,0.537141,False,entropy,20.0,2098,48,67,...,0.521127,0.55303,0.469697,0.510949,0.469697,0.547445,0.088453,0.002195,0.024526,0.018661
7,0.122447,0.10355,0.374384,0.374438,False,gini,20.0,3016,81,4,...,0.366197,0.378788,0.378788,0.372263,0.378788,0.372263,0.005573,0.001478,0.006004,0.003076
8,0.123179,0.104901,0.374384,0.374438,False,entropy,17.0,1428,99,65,...,0.366197,0.378788,0.378788,0.372263,0.378788,0.372263,0.005428,0.002669,0.006004,0.003076
9,1.219187,0.107316,0.448276,0.606097,False,entropy,,3311,31,50,...,0.492958,0.621212,0.439394,0.620438,0.409091,0.576642,0.032619,0.001961,0.034973,0.02083


bootstrap              False
criterion            entropy
max_depth                  2
max_features            2993
min_samples_leaf          22
min_samples_split         27
dtype: object

In [75]:
# Train optimized random forest
model7 = RandomForestClassifier(n_estimators=10 ,
                                min_samples_split= 27,
                                min_samples_leaf=22,
                                bootstrap = True,
                                criterion = 'entropy',
                                max_depth = 2,
                                max_features = 2993,
                                n_jobs=-1,
                                random_state=100)

model7.fit(X_train, Y_train)
print('train accuracy: '+str(jaccard_similarity_score(model7.predict(X_train), Y_train)))
print('test accuracy: '+str(jaccard_similarity_score(model7.predict(X_test), Y_test)))


train accuracy: 0.7389162561576355
test accuracy: 0.6022727272727273


In [91]:
joblib.dump(model7,'docClassif/rfc_topic.joblib')

['docClassif/rfc_topic.joblib']

In [76]:
from sklearn.metrics import confusion_matrix
# Compute confusion matrix train
y_true = Y_train
y_pred = model7.predict(X_train)
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Compute confusion matrix train
y_true = Y_test
y_pred = model7.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
print(cm)

[[45  6  0  0  1  0  0]
 [ 5 70  0  0  1  0  0]
 [ 1  2  0  0  0  0  0]
 [ 0 13  0  9  0  0  0]
 [ 1  2  0  0 22  0  0]
 [ 0  4  0  0  5  0  0]
 [ 0 10  0  2  0  0  4]]
[[ 8  4  0  0  0  0  0]
 [ 1 33  0  1  0  0  0]
 [ 0  1  0  1  0  0  0]
 [ 3 15  0  3  1  0  0]
 [ 3  0  0  0  9  0  0]
 [ 2  2  0  0  0  0  0]
 [ 0  1  0  0  0  0  0]]


In [69]:
count = Y_test.value_counts() 
count

health                      35
nutrition                   22
protection                  12
education                   12
shelter                      4
logistics                    2
water/sanitation/hygiene     1
Name: label, dtype: int64

## Predict

In [97]:
# keep docs with unknown labels
unlabeled_docs = labeled_set[labeled_set['label'] == 'Unknown']
# pca transformation
unlabeled_docs_pca = pca.transform(unlabeled_docs.drop(columns=['label']))

In [99]:
unlabeled_docs_pca

array([[-2.67847556e-05,  8.55176163e-07, -4.19160108e-05, ...,
        -2.50637343e-03,  1.19901895e-03,  8.71648851e-03],
       [-2.67618360e-05,  8.54651227e-07, -4.19015660e-05, ...,
        -2.50711314e-03,  1.19358471e-03,  8.71163039e-03],
       [-3.04034974e-05,  7.47224283e-07, -5.12432065e-05, ...,
         3.42003602e-06,  2.68822030e-04, -5.17455187e-04],
       ...,
       [-6.02401036e-05,  2.72519640e-06,  1.59021095e-04, ...,
         2.07609384e-03, -2.62988749e-06, -1.26893263e-03],
       [-3.92937932e-05,  7.60825893e-07, -3.20753298e-05, ...,
         1.40350471e-04, -1.02658951e-04, -1.17134332e-03],
       [-3.90005569e-05,  2.03594226e-07, -1.09198442e-05, ...,
         1.52688180e-04,  3.02007951e-04, -2.43142361e-04]])

In [100]:
pd.DataFrame(unlabeled_docs_pca).to_pickle('docClassif/unlabeled_docs_topics_pca.pkl')

In [101]:
# predict for the unknown labels
preds = pd.DataFrame(model7.predict(unlabeled_docs_pca), 
                     columns = ['prediction'], 
                     index=unlabeled_docs.index)

In [103]:
preds.shape

(6439, 1)

In [104]:
labeled_docs.label.shape

(291,)

In [None]:
document_topics = pd.concat([preds.rename(columns={'prediction':'label'}), labeled_docs[['label']]])

In [None]:
# save all doc types
document_types.columns=['topic']
document_types.to_pickle('docClassif/document_topics.pkl')

### XG Boost

In [78]:
import xgboost as xgb
best_params = {"eta": None,
               "min_child_weight": None,
               "gamma": None,
               "max_depth": None,
               "max_delta_step": None,
               "subsample": None,
               "colsample_bytree": None,
               "lambda": None,
               "objective": None}

number_of_iterations = {"after_tree_params": None,
                        "after_reg_params": None,
                        "after_function_param": None,
                        "after_eta": None}

In [81]:
from sklearn import preprocessing

In [None]:
transform

In [80]:
np.random.seed(random_state)

best_score = 0.0

num_iterations = 20

max_iterations = []

# get data into the correct format
xgbData = xgb.DMatrix(X_train, label=Y_train)

# prepare parameters
maxDepths = np.random.randint(3, 40, size=num_iterations)
minChildWeights = np.random.randint(0, 40, size=num_iterations)
gammas = np.random.randint(3, 50, size=num_iterations)
maxDeltaStep = np.random.randint(0, 10, size=num_iterations)

params_tupled = [(a, b, c, d) for a, b, c, d in zip(maxDepths,
                                                    minChildWeights,
                                                    gammas,
                                                    maxDeltaStep)]

# specify parameters and distributions to sample from
param_dist_XGB = {"eta": 0.3,
                  "min_child_weight": 1.0,
                  "gamma": 0.0,
                  "max_depth": 20,
                  "max_delta_step": 0.0,
                  "subsample": 0.8,
                  "colsample_bytree": 0.8,
                  "lambda": 1,
                  "objective": "binary:logistic"}

for mD, mCW ,g ,mDS in params_tupled:
    
    param_dist_XGB["max_depth"] = mD
    param_dist_XGB["min_child_weight"] = mCW
    param_dist_XGB["gamma"] = g
    param_dist_XGB["max_delta_step"] = mDS

    # get a cross validated result
    cv_XGB = xgb.cv(param_dist_XGB,
                    xgbData,
                    num_boost_round=300,
                    seed=random_state,
                    nfold=5,
                    stratified=True,
                    metrics={'auc'},
                    early_stopping_rounds=30,
                    as_pandas=True,
                    shuffle=False)
    
    # get the smallest score
    mean_auc_test_score = cv_XGB['test-auc-mean'].max()
    
    print("The highest auc score of the cross validation is {}.".format(mean_auc_test_score))
    
    if mean_auc_test_score < 0.85:
        print(" -> its params were: max_depth={}, min_child_weight={}, gamma={}, max_delta_step={}"
              .format(mD, mCW, g, mDS))
    
    max_iterations.append(cv_XGB.index.values[-1] + 1)
    
    if mean_auc_test_score > best_score:
        
        # update the smallest error
        best_score = mean_auc_test_score
        # update best parameters
        best_params["max_depth"] = mD
        best_params["min_child_weight"] = mCW
        best_params["gamma"] = g
        best_params["max_delta_step"] = mDS

number_of_iterations["after_tree_params"] = max(max_iterations)

print()
print("The optimal max_depth was found to be: {}".format(best_params["max_depth"]))
print("The optimal min_child_weight was found to be: {}".format(best_params["min_child_weight"]))
print("The optimal gamma was found to be: {}".format(best_params["gamma"]))
print("The optimal max_delta_step was found to be: {}\n".format(best_params["max_delta_step"]))

print("The best auc score for the best tree parameters is {}.\n".format(best_score))

print("The number of iterations for the train cases were {}, and the max of them is {}."
      .format(max_iterations, number_of_iterations["after_tree_params"]))

TypeError: must be real number, not str