# Authorship Profiling
### Importing required libraries

In [None]:
import xml.etree.ElementTree as ET
import os
import pandas as pd
import nltk
# nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# import multiprocessing as mp
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from nltk.corpus import wordnet
import pandas, xgboost, numpy, textblob, string
from sklearn.neural_network import MLPClassifier
from nltk.stem import WordNetLemmatizer
import random
from nltk.stem import WordNetLemmatizer
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

Now let's parse the xml files given to us. For that we are creating a function so that all the files from a given path will be parsed and saved as separate tweets 

In [None]:
def parseXML(xmlFile):
    """Parse one XML file and return the text of every tweet element in it.

    The tweets are read from the grandchildren of the root element (every
    child of every top-level element), in document order.

    Args:
        xmlFile: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        list[str]: One entry per grandchild element. An element without text
        yields an empty string instead of ``None`` so that callers can apply
        string methods to every entry.
    """
    root = ET.parse(xmlFile).getroot()
    # ``Element.text`` is None for empty elements; normalise it to "".
    return [subelem.text or "" for elem in root for subelem in elem]

In [None]:
parsed_xmls = []  # one {'id': ..., 'tweets': [...]} dictionary per XML file
path = './data/xmls'  # directory holding all the XML files

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of everything derived from parsed_xmls reproducible between runs.
for filename in sorted(os.listdir(path)):
    if not filename.endswith('.xml'):
        continue  # skip anything that is not an XML file
    parsed_xmls.append({
        'id': os.path.splitext(filename)[0],  # file name without its extension
        'tweets': parseXML(os.path.join(path, filename)),
    })

All the parsed documents are in a list of dictionaries now. Let's preprocess the tweets and extract some features.

### Preprocessing

Here we create a function that takes the parsed XML of one author as its argument and returns a dictionary with the id, the original tweets, unigram tokens based on the Bag-of-Words approach, unigram tokens based on the Parts-Of-Speech approach (tokens and tags together), and the tags as features. The dictionaries of all authors are then collected in a list.

In [None]:
wnl = WordNetLemmatizer()


def tokenizeRawData(parsedTweets):
    """Tokenise one author's tweets and derive the BoW / POS features.

    Args:
        parsedTweets: Dictionary with the keys 'id' and 'tweets', where
            'tweets' is a list of tweet strings (empty or ``None`` entries
            are ignored).

    Returns:
        dict: A dictionary with the keys
            'id'         -- copied from ``parsedTweets['id']``
            'tweets'     -- all tweets of the author joined by a single space
            'bow_tokens' -- lemmatised, lower-cased, purely alphabetic tokens
            'pos_tokens' -- (token, tag) pairs from ``nltk.pos_tag`` on the
                            alphabetic tokens (before lemmatisation)
            'pos_tags'   -- the tags of 'pos_tokens' only
    """
    # Drop empty/None entries so that .lower() below cannot fail.
    tweets = [tweet for tweet in parsedTweets['tweets'] if tweet]

    # Lower-case and tokenise every tweet, keeping alphabetic tokens only.
    alpha_tokens = [
        token
        for tweet in tweets
        for token in nltk.tokenize.word_tokenize(tweet.lower())
        if token.isalpha()
    ]
    pos_tokens = nltk.pos_tag(alpha_tokens)

    return {
        'id': parsedTweets['id'],
        # Keep the text of ALL tweets. The previous code stored the leaked
        # loop variable, i.e. only the last tweet, and raised a NameError
        # when the author had no tweets at all.
        'tweets': ' '.join(tweets),
        'bow_tokens': [wnl.lemmatize(token) for token in alpha_tokens],
        'pos_tokens': pos_tokens,
        'pos_tags': [tag for _, tag in pos_tokens],
    }

In [None]:
# Tokenization
# Tokenization: build the feature dictionary for every parsed author file
tokenized_tweets = [tokenizeRawData(author) for author in parsed_xmls]

For the ease of viewing features and computational efficiency, let's change the list to a pandas dataframe

In [None]:
# One row per author; the dictionary keys become the columns
tweet_df = pd.DataFrame(data=tokenized_tweets)

After converting to dataframe, we can easily remove the stopwords from the unigrams with the bag of words approach using lambda apply function.

In [None]:
# Stopword removal from the bag-of-words tokens (the POS columns are untouched)
with open('./data/stopwords_en.txt') as f:
    stopwords = f.read().splitlines()  # one stopword per line

# A set gives O(1) membership tests; the list made every token lookup O(n).
stopword_set = set(stopwords)
tweet_df['bow_tokens'] = tweet_df['bow_tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stopword_set]
)

In [None]:
from __future__ import division
from itertools import chain
word_list = tweet_df['bow_tokens']
words = [item for sublist in word_list for item in sublist]
vocab = set(words)
lexical_diversity = len(words)/len(vocab)
print ("Vocabulary size: ",len(vocab),"\nTotal number of tokens: ", len(words), \
"\nLexical diversity: ", lexical_diversity)

As we progress, let's see if there are any words that appear across all the documents. Those add very little value to our prediction task. We use FreqDist to count the frequency of these words. We are not counting how many times a word is used by an author, but how many authors use each word.

In [None]:
from nltk.probability import *

# Document frequency: every author contributes each distinct word once,
# so the counts say how many authors use a word.
words_2 = [word for value in word_list for word in set(value)]
fd_2 = FreqDist(words_2)
fd_2.most_common(25)

Looks like "https" is used by more than 98% of the authors. This is probably a remnant of the URL links that were stripped in our preprocessing step, so we can definitely remove it. On the assumption that the frequency of words follows a normal distribution (which will be the case as our dataset grows, by the CLT), let's keep all the words used by less than 95% of the authors.

In [None]:
# Drop the leftover URL fragment 'https' from every author's tokens
tweet_df['bow_tokens'] = tweet_df['bow_tokens'].apply(
    lambda tokens: [token for token in tokens if token != 'https']
)

# New column: number of tokens left after stopword and 'https' removal
tweet_df['token_count'] = tweet_df['bow_tokens'].apply(len)

In [None]:
# Preview the first rows after token cleaning and counting
tweet_df.head()

Next, we calculate the sentiment composite score to see if there is any significant difference between tweets by male and female authors. For that we use the SentimentIntensityAnalyzer class. We can visualise the effect later, once we split the data into training and test data.

In [None]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of VADER's polarity scores for *text*."""
    return sid.polarity_scores(text)['compound']


# One compound sentiment score per row, computed from the 'tweets' column
tweet_df['sentiment'] = tweet_df['tweets'].apply(_compound_score)

Here we are joining all the tokens into sentences. We are creating three string features. 
1. Token string: String of all unigrams under Bag-of-Words approach
2. POS String: String of all tags under Parts-Of-Speech
3. Combined token and pos: Combined string of both Bag-of-Words tokens and Parts-Of-Speech tags.

We are going to check for which of these we get a better classification model

In [None]:
# Space-separated strings that the vectorizers can consume
tweet_df['token_string'] = tweet_df['bow_tokens'].apply(lambda word: ' '.join(word))
tweet_df['pos_string'] = tweet_df['pos_tags'].apply(lambda word: ' '.join(word))
# Join the two strings with a space. Concatenating them with '' fused the
# last token and the first POS tag into one bogus word (e.g. 'dayNN').
tweet_df['token_pos_string'] = tweet_df[['token_string', 'pos_string']].apply(
    lambda row: ' '.join(row), axis=1
)

In [None]:
# Preview the dataframe including the sentiment and string feature columns
tweet_df.head()

We have preprocessed all of our tweet documents, including both the training examples and the test data. Since we are going to build models, we need to separate the training and test data: we need labels to train, and the test data doesn't have them. We load the csv files given to us and separate training from testing using the ids.

In [None]:
training_df = pd.read_csv("./data/train_labels.csv")
testing_df = pd.read_csv("./data/test.csv")

# Split the preprocessed authors into training and test rows by id
training_id = list(training_df['id'])
testing_id = list(testing_df['id'])
tweet_df_train = tweet_df[tweet_df['id'].isin(training_id)]
tweet_df_test = tweet_df[tweet_df['id'].isin(testing_id)]

# Attach the gender label to every training row and encode it as
# male -> 1, female -> 0.
tweet_df_train = tweet_df_train.merge(training_df[['id', 'gender']], on='id', how='left')
# Use a single .loc assignment: the chained form df['gender'][mask] = value
# may write to a temporary copy and is a no-op under pandas Copy-on-Write.
tweet_df_train.loc[tweet_df_train['gender'] == 'male', 'gender'] = 1
tweet_df_train.loc[tweet_df_train['gender'] == 'female', 'gender'] = 0

Now that we have the labels for the training examples, let's see if the sentiment score we calculated shows any significant difference between male and female authors.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# Encoded labels to compare: 1 = male, 0 = female
gender = [1, 0]
# A separate loop variable keeps the list above intact; the old
# `for gender in gender` overwrote it with its last element.
for gender_label in gender:
    # Density of the sentiment score for one gender. kdeplot is the documented
    # replacement for the deprecated distplot(..., hist=False).
    sns.kdeplot(
        tweet_df_train[tweet_df_train['gender'] == gender_label]['sentiment'],
        label=gender_label,
    )

plt.legend()

Looks like the tweets by both category of authors follow very similar distribution. So we are not going to use them as features.

In [None]:
# Preview the labelled training rows
tweet_df_train.head()

## Model building

Let's first try just the Token string on all six models and save all the accuracy scores to a dataframe.

The commented-out sections compute the accuracy of the different models on the count vector, tf-idf, and n-gram features of the three strings "Token String", "POS String" and "Token POS String".

In [None]:
# accuracy_token_df = pd.DataFrame(np.nan, index=[0, 1, 2], columns=['Features', 'NaiveBayes', 'LogisticRegression', 'SupportVectorMachine', 
#                                                                 'RandomForest', 'XGBoost', 'NeuralNet'])
# accuracy_token_df['Features'] = ['CountVector', 'WordLevelTF_IDF', 'N-GramVector']

In [None]:
# def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
#     random.seed(3000)
#     # fit the training dataset on the classifier
#     classifier.fit(feature_vector_train, label)
    
#     # predict the labels on validation dataset
#     predictions = classifier.predict(feature_vector_valid)
    
#     if is_neural_net:
#         predictions = predictions.argmax(axis=-1)
    
#     return metrics.accuracy_score(predictions, valid_y)

In [None]:
# # split the dataset into training and validation datasets 
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(tweet_df_train['token_string'], tweet_df_train['gender'])

# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_y = encoder.fit_transform(train_y)
# valid_y = encoder.fit_transform(valid_y)

# # create a count vectorizer object 
# count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',  max_features=3000)
# count_vect.fit(train_x)

# # transform the training and validation data using count vectorizer object
# xtrain_count =  count_vect.transform(train_x)
# xvalid_count =  count_vect.transform(valid_x)

# # word level tf-idf
# tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',  max_features=3000)
# tfidf_vect.fit(tweet_df_train['token_string'])
# xtrain_tfidf =  tfidf_vect.transform(train_x)
# xvalid_tfidf =  tfidf_vect.transform(valid_x)

# # ngram level tf-idf 
# tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3),  max_features=3000)
# tfidf_vect_ngram.fit(tweet_df_train['token_string'])
# xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
# xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)



In [None]:
# # Naive Bayes on Count Vectors
# accuracy1 = train_model(naive_bayes.MultinomialNB(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

# # Naive Bayes on Word Level TF IDF Vectors
# accuracy2 = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())

# # Naive Bayes on Ngram Level TF IDF Vectors
# accuracy3 = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())

# accuracy_token_df['NaiveBayes'] = [accuracy1, accuracy2, accuracy3]



# # Linear Classifier on Count Vectors
# accuracy5 = train_model(linear_model.LogisticRegression(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc().tocsc())

# # Linear Classifier on Word Level TF IDF Vectors
# accuracy6 = train_model(linear_model.LogisticRegression(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy7 = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())

# accuracy_token_df.iloc[:,2] = [accuracy5, accuracy6, accuracy7]




# # SVM on Ngram Level TF IDF Vectors
# accuracy9 = train_model(svm.SVC(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

# # Linear Classifier on Word Level TF IDF Vectors
# accuracy10 = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy11 = train_model(svm.SVC(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())

# accuracy_token_df.iloc[:,3] = [accuracy9, accuracy10, accuracy11]



# from sklearn import decomposition, ensemble

# # RF on Count Vectors
# accuracy13 = train_model(ensemble.RandomForestClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

# # RF on Word Level TF IDF Vectors
# accuracy14 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy15 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_token_df.iloc[:,4] = [accuracy13, accuracy14, accuracy15]





# # Extereme Gradient Boosting on Count Vectors
# accuracy17 = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

# # Extereme Gradient Boosting on Word Level TF IDF Vectors
# accuracy18 = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())

# accuracy19 = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_token_df.iloc[:, 5] = [accuracy17, accuracy18, accuracy19]





# # Multi-layer Perceptron on Count Vectors
# accuracy21 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

# # Multi-layer Perceptron on Word Level TF IDF Vectors
# accuracy22 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())

# # # Multi-layer Perceptron on Character Level NGrams
# accuracy23 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())
# # print "Multi-layer Perceptron, CharLevel Vectors: ", accuracy

# accuracy_token_df.iloc[:, 6] = [accuracy21, accuracy22, accuracy23]


Now let's try the same with POS string

In [None]:
# accuracy_pos_df = pd.DataFrame(np.nan, index=[0, 1, 2], columns=['Features', 'NaiveBayes', 'LogisticRegression', 'SupportVectorMachine', 
#                                                                 'RandomForest', 'XGBoost', 'NeuralNet'])
# accuracy_pos_df['Features'] = ['CountVector', 'WordLevelTF_IDF', 'N-GramVector']

In [None]:
# # split the dataset into training and validation datasets 
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(tweet_df_train['pos_string'], tweet_df_train['gender'])

# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_y = encoder.fit_transform(train_y)
# valid_y = encoder.fit_transform(valid_y)

# # create a count vectorizer object 
# count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',  max_features=3000)
# count_vect.fit(tweet_df_train['pos_string'])

# # transform the training and validation data using count vectorizer object
# xtrain_count =  count_vect.transform(train_x)
# xvalid_count =  count_vect.transform(valid_x)

# # word level tf-idf
# tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',  max_features=3000)
# tfidf_vect.fit(tweet_df_train['pos_string'])
# xtrain_tfidf =  tfidf_vect.transform(train_x)
# xvalid_tfidf =  tfidf_vect.transform(valid_x)

# # ngram level tf-idf 
# tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3),  max_features=3000)
# tfidf_vect_ngram.fit(tweet_df_train['pos_string'])
# xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
# xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)


In [None]:
# # Naive Bayes on Count Vectors
# accuracy1 = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)

# # Naive Bayes on Word Level TF IDF Vectors
# accuracy2 = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Naive Bayes on Ngram Level TF IDF Vectors
# accuracy3 = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_pos_df['NaiveBayes'] = [accuracy1, accuracy2, accuracy3]



# # Linear Classifier on Count Vectors
# accuracy5 = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)

# # Linear Classifier on Word Level TF IDF Vectors
# accuracy6 = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy7 = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_pos_df.iloc[:,2] = [accuracy5, accuracy6, accuracy7]




# # SVM on Ngram Level TF IDF Vectors
# accuracy9 = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)

# # Linear Classifier on Word Level TF IDF Vectors
# accuracy10 = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy11 = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_pos_df.iloc[:,3] = [accuracy9, accuracy10, accuracy11]



# from sklearn import decomposition, ensemble

# # RF on Count Vectors
# accuracy13 = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)

# # RF on Word Level TF IDF Vectors
# accuracy14 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy15 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)


# accuracy_pos_df.iloc[:,4] = [accuracy13, accuracy14, accuracy15]





# # Extereme Gradient Boosting on Count Vectors
# accuracy17 = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

# # Extereme Gradient Boosting on Word Level TF IDF Vectors
# accuracy18 = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())

# accuracy19 = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_pos_df.iloc[:, 5] = [accuracy17, accuracy18, accuracy19]





# # Multi-layer Perceptron on Count Vectors
# accuracy21 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_count, train_y, xvalid_count)

# # Multi-layer Perceptron on Word Level TF IDF Vectors
# accuracy22 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_tfidf, train_y, xvalid_tfidf)

# # # Multi-layer Perceptron on Character Level NGrams
# accuracy23 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
# # print "Multi-layer Perceptron, CharLevel Vectors: ", accuracy

# accuracy_pos_df.iloc[:, 6] = [accuracy21, accuracy22, accuracy23]

Finally let's try this on the combination of both.

In [None]:
# accuracy_comb_df = pd.DataFrame(np.nan, index=[0, 1, 2], columns=['Features', 'NaiveBayes', 'LogisticRegression', 'SupportVectorMachine', 
#                                                                 'RandomForest', 'XGBoost', 'NeuralNet'])
# accuracy_comb_df['Features'] = ['CountVector', 'WordLevelTF_IDF', 'N-GramVector']

In [None]:
# # split the dataset into training and validation datasets 
# train_x, valid_x, train_y, valid_y = model_selection.train_test_split(tweet_df_train['pos_string'], tweet_df_train['gender'])

# # label encode the target variable 
# encoder = preprocessing.LabelEncoder()
# train_y = encoder.fit_transform(train_y)
# valid_y = encoder.fit_transform(valid_y)

# # create a count vectorizer object 
# count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}',  max_features=3000)
# count_vect.fit(tweet_df_train['token_pos_string'])

# # transform the training and validation data using count vectorizer object
# xtrain_count =  count_vect.transform(train_x)
# xvalid_count =  count_vect.transform(valid_x)

# # word level tf-idf
# tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',  max_features=3000)
# tfidf_vect.fit(tweet_df_train['token_pos_string'])
# xtrain_tfidf =  tfidf_vect.transform(train_x)
# xvalid_tfidf =  tfidf_vect.transform(valid_x)

# # ngram level tf-idf 
# tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3),  max_features=3000)
# tfidf_vect_ngram.fit(tweet_df_train['token_pos_string'])
# xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
# xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)


In [None]:
# # Naive Bayes on Count Vectors
# accuracy1 = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)

# # Naive Bayes on Word Level TF IDF Vectors
# accuracy2 = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Naive Bayes on Ngram Level TF IDF Vectors
# accuracy3 = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_comb_df['NaiveBayes'] = [accuracy1, accuracy2, accuracy3]



# # Linear Classifier on Count Vectors
# accuracy5 = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)

# # Linear Classifier on Word Level TF IDF Vectors
# accuracy6 = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy7 = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_comb_df.iloc[:,2] = [accuracy5, accuracy6, accuracy7]




# # SVM on Ngram Level TF IDF Vectors
# accuracy9 = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)

# # Linear Classifier on Word Level TF IDF Vectors
# accuracy10 = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)

# # Linear Classifier on Ngram Level TF IDF Vectors
# accuracy11 = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)

# accuracy_comb_df.iloc[:,3] = [accuracy9, accuracy10, accuracy11]



# from sklearn import decomposition, ensemble

# # RF on Count Vectors
# accuracy13 = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)

# # RF on Word Level TF IDF Vectors
# accuracy14 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)

# # RF on Ngram Level TF IDF Vectors
# accuracy15 = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)


# accuracy_comb_df.iloc[:,4] = [accuracy13, accuracy14, accuracy15]





# # Extereme Gradient Boosting on Count Vectors
# accuracy17 = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())

# # Extereme Gradient Boosting on Word Level TF IDF Vectors
# accuracy18 = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())

# accuracy19 = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())

# accuracy_comb_df.iloc[:, 5] = [accuracy17, accuracy18, accuracy19]





# # Multi-layer Perceptron on Count Vectors
# accuracy21 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_count, train_y, xvalid_count)

# # Multi-layer Perceptron on Word Level TF IDF Vectors
# accuracy22 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_tfidf, train_y, xvalid_tfidf)

# # # Multi-layer Perceptron on Character Level NGrams
# accuracy23 = train_model(MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())
# # print "Multi-layer Perceptron, CharLevel Vectors: ", accuracy

# accuracy_comb_df.iloc[:, 6] = [accuracy21, accuracy22, accuracy23]

Let's load our test label data and check the accuracy of our classifier predictions

In [None]:
testing_label_df = pd.read_csv("./data/test_labels.csv")

# Attach the true gender to every test row and encode it as
# male -> 1, female -> 0 (same encoding as the training labels).
tweet_df_test = tweet_df_test.merge(testing_label_df[['id', 'gender']], on='id', how='left')
# Use a single .loc assignment: the chained form df['gender'][mask] = value
# may write to a temporary copy and is a no-op under pandas Copy-on-Write.
tweet_df_test.loc[tweet_df_test['gender'] == 'male', 'gender'] = 1
tweet_df_test.loc[tweet_df_test['gender'] == 'female', 'gender'] = 0

## Model Comparison

The following commented-out cell has the pandas dataframe with the accuracies calculated for the token string

Now let's see what our accuracy dataframes look like

The one with Token string:

In [None]:
# accuracy_token_df

With just the unigram tokens from Bag-Of-Words approach, the maximum accuracy we are getting is 76.2% with Logistic Regression and tf-idf vector. Let's use that and predict our test data and see the accuracy.

In [None]:
# Encode the labels. The encoder is fitted on the training labels only and the
# same mapping is reused for the test labels; re-fitting on the test labels
# could silently produce a different mapping if the label sets differ.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(tweet_df_train['gender'])
test_y = encoder.transform(tweet_df_test['gender'])

# Word-level TF-IDF restricted to at most 3000 features
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=3000)
tfidf_vect.fit(tweet_df_train['token_string'])

# Transform the training and test data using the fitted TF-IDF vectorizer
xtrain_tfidf =  tfidf_vect.transform(tweet_df_train['token_string'])
xtest_tfidf =  tfidf_vect.transform(tweet_df_test['token_string'])

classifier = linear_model.LogisticRegression()
classifier.fit(xtrain_tfidf, train_y)
# NOTE: despite its name this holds the logistic-regression predictions;
# the name is kept so that existing references keep working.
y_pred_XGB = classifier.predict(xtest_tfidf)
accuracy_score(test_y, y_pred_XGB)

We are getting an accuracy of 78.4%. 

The following commented-out cell has the pandas dataframe with the accuracies calculated for the pos string

Now let's see what POS String has got:

In [None]:
# accuracy_pos_df

The highest is for Logistic Regression with the N-Gram vector.

In [None]:
# TF-IDF over word 2- and 3-grams of the POS tag strings (at most 3000
# features), fitted on the training data only
tfidf_vect_ngram = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(2, 3),
    max_features=3000,
).fit(tweet_df_train['pos_string'])

# Vectorise the training and test POS strings with the fitted vectorizer
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(tweet_df_train['pos_string'])
xtest_tfidf_ngram = tfidf_vect_ngram.transform(tweet_df_test['pos_string'])

# Logistic regression on the n-gram features, scored on the test labels
classifier = linear_model.LogisticRegression().fit(xtrain_tfidf_ngram, train_y)
y_pred_LR = classifier.predict(xtest_tfidf_ngram)
accuracy_score(test_y, y_pred_LR)

The accuracy we are getting here is 70%. 

The following commented-out cell has the pandas dataframe with the accuracies calculated for the combined token and pos string

Hopefully for the combined BOW and POS String we'll get better accuracy

In [None]:
# accuracy_comb_df

For the combination model, we have NeuralNet with Count Vector as most accurate. Let's use this to predict our test data classification.


## Final Model

In [None]:
#### Count vectors of the combined token + POS string for the final model
# Label-encode the target variable. The encoder is fitted on the training
# labels only and the same mapping is reused for the test labels.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(tweet_df_train['gender'])
test_y = encoder.transform(tweet_df_test['gender'])

# Create a count vectorizer object limited to at most 3000 features
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features = 3000)
count_vect.fit(tweet_df_train['token_pos_string'])

# Transform the training and test data using the count vectorizer object
xtrain_count =  count_vect.transform(tweet_df_train['token_pos_string'])
xtest_count =  count_vect.transform(tweet_df_test['token_pos_string'])

In [None]:
# Multi-layer perceptron with one hidden layer of 100 units, trained on the
# count vectors; random_state is fixed so the run is repeatable
clf = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(100,),
    random_state=1,
).fit(xtrain_count, train_y)

# Predict the test set and report the accuracy against the true labels
y_pred_MLP = clf.predict(xtest_count)
accuracy_score(test_y, y_pred_MLP)


Looks like the Neural Network on the count vector of the combined POS and BoW tokens gives us the best accuracy. Let's write its predictions to a csv file

In [None]:
# Build the prediction file: one row per test author with the predicted gender
test_csv = pd.DataFrame(tweet_df_test['id'])

# Decode the numeric predictions (1 = male, 0 = female) in a single step.
# The old chained assignment wrote strings into an integer column through a
# possibly temporary copy, which is a no-op under pandas Copy-on-Write.
label_names = {1: 'male', 0: 'female'}
test_csv['gender'] = [label_names.get(int(pred), pred) for pred in y_pred_MLP]
test_csv.to_csv('pred_labels.csv', index = False)