In [35]:
# Prerequisites
# Charalambos Themistocleous
#
# Clean memory before rerunning: delete every global whose name does not start
# with an underscore, so the cells below run against an (almost) empty namespace.
# Previously imported modules are removed as well, so this cell has to be
# executed before the import cell.
# NOTE(review): inside an IPython/Jupyter kernel `%reset -f` looks like the
# built-in way to get the same effect -- confirm before swapping it in.
for name in dir():
    if not name.startswith('_'): del globals()[name]
# dir()  # uncomment to list the names that are still defined

In [36]:
# libraries for dataset preparation, feature engineering, model training 
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import decomposition, ensemble
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, f1_score, auc, roc_curve
from scipy import interp
import pandas as pd
import numpy as np
import numpy, textblob, string, os
from keras.preprocessing import text, sequence
from keras import layers, models 

# Dataset preparation

In [38]:
# Locations of the two input files and of the merged output file.
textpath, speechpath = 'data/text_data.tsv', 'phone_processed.csv'
combinedpath = 'data/combined_data.tsv'

# The text file is tab-separated and has no header row, so its columns are named here.
cols = ['label_t', 'speaker', 'text']
tdata = pd.read_csv(textpath, sep='\t', header=None, names=cols)
# The speech file is comma-separated; its first row supplies the column names.
sdata = pd.read_csv(speechpath)

# Encode the string label as an integer code; labels absent from the mapping become NaN.
tdata['label'] = tdata['label_t'].map({'s': 0, 'l': 1, 'n': 2, 'naos': 3})

  interactivity=interactivity, compiler=compiler, result=result)


## Text / NLP based features

A number of extra text-based features can also be created; they are sometimes helpful for improving text classification models. Some examples are:

* Word Count of the documents – total number of words in the documents
* Character Count of the documents – total number of characters in the documents
* Average Word Density of the documents – average length of the words used in the documents
* Punctuation Count of the documents – total number of punctuation marks in the documents
* Upper Case Count of the documents – total number of upper-case words in the documents
* Title Word Count of the documents – total number of title-case (capitalized) words in the documents
* Frequency distribution of Part of Speech Tags:
    - Noun Count
    - Verb Count
    - Adjective Count
    - Adverb Count
    - Pronoun Count

These features are highly experimental and should be used only when they suit the problem at hand.

In [41]:
# Document length in characters and in whitespace-separated tokens.
tdata['char_count'] = tdata['text'].apply(len)
tdata['word_count'] = tdata['text'].apply(lambda doc: len(doc.split()))
# Characters per word; the +1 in the denominator avoids dividing by zero for empty documents.
tdata['char_word_ratio'] = tdata['char_count'].div(tdata['word_count'] + 1)
# Number of characters that are members of string.punctuation.
tdata['punctuation_count'] = tdata['text'].apply(
    lambda doc: sum(1 for ch in doc if ch in string.punctuation))
# Number of tokens written in title case, and number written entirely in upper case.
tdata['title_word_count'] = tdata['text'].apply(
    lambda doc: sum(1 for tok in doc.split() if tok.istitle()))
tdata['upper_case_word_count'] = tdata['text'].apply(
    lambda doc: sum(1 for tok in doc.split() if tok.isupper()))

In [42]:
# Part-of-speech tags grouped into the coarse families that are counted per document.
pos_family = dict([
    ('noun', ['NN', 'NNS', 'NNP', 'NNPS']),
    ('pron', ['PRP', 'PRP$', 'WP', 'WP$']),
    ('verb', ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']),
    ('adj', ['JJ', 'JJR', 'JJS']),
    ('adv', ['RB', 'RBR', 'RBS', 'WRB']),
])

# Count the words of one part-of-speech family in a given sentence.
def check_pos_tag(x, flag):
    """Count the words in ``x`` whose part-of-speech tag belongs to one family.

    Parameters
    ----------
    x : str
        Text to tag. A value that is not a string (for example the NaN that
        pandas uses for a missing cell) is treated as containing no words.
    flag : str
        Key of ``pos_family`` that names the family to count.

    Returns
    -------
    int
        Number of entries of ``textblob.TextBlob(x).tags`` whose tag is listed
        in ``pos_family[flag]``; 0 when the text could not be tagged.

    Raises
    ------
    KeyError
        If ``flag`` is not a key of ``pos_family``.
    """
    import warnings

    # Resolve the family outside the try block: a misspelled flag is a
    # programming error and must not be reported as a count of 0.
    wanted = pos_family[flag]

    if not isinstance(x, str):
        return 0

    try:
        tagged = textblob.TextBlob(x).tags
    except Exception as err:
        # Stay best-effort (the row still gets 0) but make the failure visible,
        # so a broken tagger setup is not mistaken for "no matching words".
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        warnings.warn(
            "check_pos_tag: could not tag text (%s: %s)" % (type(err).__name__, err),
            RuntimeWarning,
            stacklevel=2,
        )
        return 0

    # Each entry is a (word, tag) pair; only the tag matters here.
    return sum(1 for pair in tagged if pair[1] in wanted)

# Families in the order in which their columns are created.
pos_order = ['noun', 'verb', 'adj', 'adv', 'pron']

# Per-document count of the words in each family.
for family in pos_order:
    tdata[family + '_count'] = tdata['text'].apply(check_pos_tag, flag=family)

# POS ratio: every pair of families, the earlier one divided by the later one.
# A zero denominator yields inf (NaN for 0/0) instead of raising.
for idx, numer in enumerate(pos_order):
    for denom in pos_order[idx + 1:]:
        tdata[numer + '_' + denom + '_ratio'] = (
            tdata[numer + '_count'] / tdata[denom + '_count'])

# Mean POS per word: share of the document's words that fall into each family.
for column, family in [('mean_nouns', 'noun'),
                       ('mean_pron', 'pron'),
                       ('mean_verbs', 'verb'),
                       ('mean_adj', 'adj'),
                       ('mean_adv', 'adv')]:
    tdata[column] = tdata[family + '_count'] / tdata['word_count']

In [43]:
# Left join: keep every text row and attach the speech columns that share its speaker.
df_data = tdata.merge(sdata, on='speaker', how='left')

In [44]:
# Turn +/-inf (produced by the ratio columns when a count is zero) into missing values.
df = df_data.replace(to_replace=[np.inf, -np.inf], value=np.nan)
# NOTE(review): to_csv writes comma-separated text and includes the index column by
# default, although the target name ends in .tsv -- confirm downstream readers expect this.
df.to_csv(path_or_buf=combinedpath)