# Authorship Profiling: Gender classification on Twitter-Posts

###### Name: Mohammad Zaki Gundagi

Programming Language: Python 3 in Jupyter Notebook 6.0.0

Python Libraries used:
- nltk
- pandas
- zipfile
- re
- numpy
- text2digits
- sklearn.model_selection - train_test_split, GridSearchCV
- sklearn.linear_model - LogisticRegression, SGDClassifier
- sklearn.metrics - metrics
- sklearn.svm - SVC
- sklearn.feature_extraction.text - TfidfVectorizer

## Table of Contents

* [Data extraction](#sec_1)
* [Data pre-processing and featurization](#sec_2)
* [Classifier](#sec_3)
* [Evaluation](#sec_4)
* [Conclusion](#sec_5)

In [1]:
# libraries
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import *
import pandas as pd
import zipfile
import re
import numpy as np

#!pip install text2digits
from text2digits import text2digits 

from sklearn.model_selection import train_test_split, GridSearchCV

# for creating logistic model and linear SVM model
from sklearn.linear_model import LogisticRegression, SGDClassifier

# for creating support vector machine (SVM) model
from sklearn.svm import SVC

import sklearn.metrics as metrics
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ariha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Data extraction <a class="anchor" id="sec_1"></a>

In [2]:
# Read the label/id tables and open the archive that holds one XML file per author.
train_label = pd.read_csv('train_labels.csv')
test_data = pd.read_csv('test.csv')
# NOTE: the archive is intentionally left open; the evaluation section reads
# the test authors' XML files from `z` as well.
z = zipfile.ZipFile('data.zip', 'r')

# Post bodies sit inside CDATA sections; the text is lower-cased before
# matching, hence the lower-case "cdata". Compiled once, outside the loop.
pattern1 = re.compile(r'cdata\[(.*?)\]]>', re.DOTALL)

# Two parallel lists: the posts of each training author and that author's gender.
train = []
train_gender = []

for author_id, gender in zip(train_label['id'], train_label['gender']):
    # decode the XML file and convert it to lower-case letters
    data = z.read('data/' + author_id + '.xml').decode("utf-8").lower()

    # extract the body of every post
    data1 = pattern1.findall(data)

    # append in place instead of rebuilding the lists on every iteration
    train.append(data1)
    train_gender.append(gender)

# 2. Data pre-processing and featurization <a class="anchor" id="sec_2"></a>

In [3]:
# Lookup table used to expand English contractions. Keys are written without
# the apostrophe because punctuation is stripped from the posts before the
# lookup happens; values are the expanded forms.
# NOTE(review): several keys are also ordinary English words once the
# apostrophe is gone (e.g. "hell", "ill", "well", "were", "its", "shed",
# "shell", "wed", "id"), so those words are rewritten too - confirm this is
# intended.
contractions = {
    "aint": "am not",
    "arent": "are not",
    "cant": "cannot",
    "cantve": "cannot have",
    "cause": "because",
    "couldve": "could have",
    "couldnt": "could not",
    "couldntve": "could not have",
    "didnt": "did not",
    "doesnt": "does not",
    "dont": "do not",
    "hadnt": "had not",
    "hadntve": "had not have",
    "hasnt": "has not",
    "havent": "have not",
    "hed": "he had",
    "hedve": "he would have",
    "hell": "he shall",
    "hellve": "he shall have",
    "hes": "he has",
    "howd": "how did",
    "howdy": "how do you",
    "howll": "how will",
    "hows": "how has",
    "id": "I had",
    "idve": "I would have",
    "ill": "I shall",
    "illve": "I shall have",
    "im": "I am",
    "ive": "I have",
    "isnt": "is not",
    "itd": "it had",
    "itdve": "it would have",
    "itll": "it shall",
    "itllve": "it shall have",
    "its": "it has",
    "lets": "let us",
    "maam": "madam",
    "maynt": "may not",
    "mightve": "might have",
    "mightnt": "might not",
    "mightntve": "might not have",
    "mustve": "must have",
    "mustnt": "must not",
    "mustntve": "must not have",
    "neednt": "need not",
    "needntve": "need not have",
    "oclock": "of the clock",
    "oughtnt": "ought not",
    "oughtntve": "ought not have",
    "shant": "shall not",
    "shantve": "shall not have",
    "shed": "she had",
    "shedve": "she would have",
    "shell": "she shall",
    "shellve": "she shall have",
    "shes": "she has",
    "shouldve": "should have",
    "shouldnt": "should not",
    "shouldntve": "should not have",
    "sove": "so have",
    "sos": "so as",
    "thatd": "that would",
    "thatdve": "that would have",
    "thats": "that has",
    "thered": "there had",
    "theredve": "there would have",
    "theres": "there has",
    "theyd": "they had",
    "theydve": "they would have",
    "theyll": "they shall",
    "theyllve": "they shall have",
    "theyre": "they are",
    "theyve": "they have",
    "tove": "to have",
    "wasnt": "was not",
    "wed": "we had",
    "wedve": "we would have",
    "well": "we will",
    "wellve": "we will have",
    "were": "we are",
    "weve": "we have",
    "werent": "were not",
    "whatll": "what shall",
    "whatllve": "what shall have",
    "whatre": "what are",
    "whats": "what has",
    "whatve": "what have",
    "whens": "when has",
    "whenve": "when have",
    "whered": "where did",
    "wheres": "where has",
    "whereve": "where have",
    "wholl": "who shall",
    "whollve": "who shall have",
    "whos": "who has",
    "whove": "who have",
    "whys": "why has",
    "whyve": "why have",
    "willve": "will have",
    "wont": "will not",
    "wontve": "will not have",
    "wouldve": "would have",
    "wouldnt": "would not",
    "wouldntve": "would not have",
    "yall": "you all",
    "yalld": "you all would",
    "yalldve": "you all would have",
    "yallre": "you all are",
    "yallve": "you all have",
    "youd": "you had",
    "youdve": "you would have",
    "youll": "you shall",
    "youllve": "you shall have",
    "youre": "you are",
    "youve": "you have",
}

## 2.1 Tokenization and removing stop words:

In [None]:
# Clean every training author's posts into one flat token list per author.

# Built once up front: the stop-word lookup was previously re-evaluated for
# every token, and a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Patterns compiled once, outside the loops.
whitespace_run = re.compile(r'\s+')
apostrophe_s = re.compile(r'\'s|\' s')
symbols = re.compile(r'[^\w\s]')
web_link = re.compile(r'https.*')

words = []
for posts in train:
    word_set = []
    for sentences in posts:
        # collapse runs of whitespace into a single space
        sentences = whitespace_run.sub(' ', sentences)

        # remove apostrophe s
        sentences = apostrophe_s.sub('', sentences)

        # remove symbols
        sentences = symbols.sub('', sentences)

        for token in sentences.split():
            # Expand contractions token by token. str.replace on the whole
            # sentence also rewrote matches inside longer words
            # (e.g. the "id" in "video").
            expanded = contractions.get(token, token)

            for piece in expanded.split():
                # delete web links (from "https" to the end of the token)
                piece = web_link.sub('', piece)

                # drop empty strings and stop words
                if piece and piece not in stop_words:
                    word_set.append(piece)

    words.append(word_set)

## 2.2 Removing the most/least frequent words

In [None]:
# One list of distinct tokens per author, so that each author contributes
# at most once to a token's document frequency.
dis_words = [list(set(author_tokens)) for author_tokens in words]

In [None]:
# Document frequency: for every token, the number of authors who used it.
word_dict = {}
for author_vocab in dis_words:
    for token in author_vocab:
        if token in word_dict:
            word_dict[token] += 1
        else:
            word_dict[token] = 1

In [None]:
# Tokens used by more than 95% or by fewer than 5% of the authors are
# discarded. The thresholds are derived from the actual number of authors
# instead of a hard-coded corpus size.
n_authors = len(dis_words)
upper_bound = n_authors * 0.95
lower_bound = n_authors * 0.05

# most and least frequent words
remove_list = [
    key
    for key, doc_freq in word_dict.items()
    if doc_freq > upper_bound or doc_freq < lower_bound
]

In [None]:
# Drop the too-frequent / too-rare tokens from every author's token list.
# A set makes each membership test constant-time; testing against the list
# scanned it once per token. No per-author progress output is emitted.
tokens_to_remove = set(remove_list)

clean_word = [
    [word for word in author_tokens if word not in tokens_to_remove]
    for author_tokens in words
]

In [None]:
# np.save('data.npy',clean_word)

In [4]:
# Reload the cached token lists and convert them back to plain Python lists.
# NOTE: allow_pickle=True unpickles the file's contents - only load a
# 'data.npy' that was produced by this notebook.
preprocessed_train_data = np.load('data.npy', allow_pickle=True).tolist()


## 2.3 Removing numbers and small words

In [5]:
# Converter instance shared by the train and test preprocessing steps below,
# where it is applied to every token before the non-alphabetic filter.
t2d = text2digits.Text2Digits() 

In [6]:
# Single pass per author: run each token through the number converter, then
# keep it only if the result is purely alphabetic (this removes numbers) and
# at least three characters long (this removes 1- and 2-letter words).
filtered_train = []
for author_tokens in preprocessed_train_data:
    kept = []
    for token in author_tokens:
        converted = t2d.convert(token)
        if converted.isalpha() and len(converted) >= 3:
            kept.append(converted)
    filtered_train.append(kept)
preprocessed_train_data = filtered_train

## 2.4 Stemming 

In [7]:
# Reduce every training token to its Porter stem.
stemmer = PorterStemmer()
preprocessed_train_data = [
    list(map(stemmer.stem, author_tokens))
    for author_tokens in preprocessed_train_data
]

## 2.5 Create dataframe to be used in featurization

In [8]:
# Pair each author's token list with that author's gender label.
# NOTE: zip() stops at the shorter input, so a length mismatch between the
# cached token lists and the labels would be silently truncated.
review_data1 = list(zip(preprocessed_train_data, train_gender))

In [9]:
# One row per training author: 'Words' holds the token list, 'gender' the label.
train_df = pd.DataFrame(review_data1, columns =['Words', 'gender'])

## 2.6 TF-IDF feature extraction for training data

In [10]:
def identity_tokenizer(text):
    """Return *text* unchanged.

    Passed to TfidfVectorizer as its tokenizer so that documents which are
    already lists of tokens are used as they are.
    """
    return text

# lowercase=False: the documents are token lists rather than strings, so the
# vectorizer must not try to lower-case them.
vectorizer = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)
# Fit on the training documents and build the training TF-IDF matrix; the
# fitted vectorizer is reused later to transform the test documents.
x_train = vectorizer.fit_transform(train_df['Words'])

## 2.7 One Hot Encoding

One Hot Encoding featurization technique - `Not being used for the optimal model`

The following code is just for representation.

```python
def one_hot():
    input = preprocessed_train_data
    dis_list =[]
    for word in input:
        dis_list = dis_list+word
        dis_list = list(set(dis_list))
        # create an empty dataframe with unique words
        df=pd.DataFrame(columns = dis_list)
    z = 0
    while z < len(input):
        df.loc[z] = 0
        for item in input[z]:
            # add one for each appear
            df.loc[z,item] = df.loc[z,item]+1
        z =z+1
    return df
df=one_hot()
```

# 3. Classifier <a class="anchor" id="sec_3"></a>

## 3.1 KNN

`def test_accuracy():  
    train_error = []  
    test_error = []  
    i=1
    while i < 100:  
        neigh = KNeighborsClassifier(n_neighbors=i)  
        neigh.fit(df, train_gender)  
        predict = neigh.predict(df)  
        predict1 = neigh.predict(test_df)  
        z = 0  
        a = 0  
        b = 0  
        while z < len(predict):  
            if predict[z]!=train_gender[z]:  
                a=a+1  
            z=z+1  
        z = 0  
        while z < len(predict1):  
            if predict1[z]!=test_gender[z]:  
                b=b+1  
            z = z+1  
        train_error = train_error+[a]  
        test_error = test_error+[b]  
        i = i+1
    for item in test_error:  
        print((500-item)/500)`

## 3.2 Logistic Model Training and validation

Splitting the dataset into training data (90%) and validation/test data (10%).

In [None]:
# Hold out 10% of the training authors for validation. The split is seeded
# (as the SVM section's split is) so that the reported validation accuracy
# can be reproduced from run to run.
X_train, X_test, y_train, y_test = train_test_split(x_train, train_df['gender'], test_size=0.10, random_state=0)

In [None]:
# Candidate values for the logistic regression parameter C that the grid
# search below tries, spanning 0.001 to 1000 in powers of ten.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }

By setting the solver to 'liblinear', both L1 and L2 regularization are supported.

In [None]:
# Grid-search C (values from param_grid) for a liblinear logistic regression,
# fitted on the 90% training split only.
clf = GridSearchCV(LogisticRegression(random_state=1, solver='liblinear'),param_grid).fit(X_train, y_train)

In [None]:
# Predict the gender of the held-out validation authors.
pred=clf.predict(X_test)

Calculating accuracy on the held-out 10% validation split (taken from the training data), using the model fitted on the other 90%.

In [None]:
# Fraction of validation authors whose predicted gender matches the label.
print(metrics.accuracy_score( y_test,pred))

## 3.3 SVM

#### Dividing the training data in an 80-20 ratio: 80% for training and 20% for testing

`# Doing 80-20 split
X_train, X_test, y_train, y_test = train_test_split(x_train, train_label['gender'], test_size=0.2, random_state=0)`

#### Trying a simple SVM model and testing it

`# Building SVM model
clf_1 = SVC(probability=True, kernel='rbf')
clf_1.fit(X_train, y_train)`<br>

`# Making predictions and testing accuracy
predictions = clf_1.predict(X_test)
print(metrics.accuracy_score(y_test,predictions))`

0.7758064516129032

#### Trying another model with modified parameters and testing it

`# Building SVM model
clf_2 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
clf_2.fit(X_train, y_train)`<br>

`# Making predictions and testing accuracy
predictions = clf_2.predict(X_test)
print(metrics.accuracy_score(y_test,predictions))`

0.7725806451612903

We notice that the simpler model worked better than this one.

#### Let's find the best model using Grid Search Cross Validation

`# Parameters
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}`<br>

`# Building several SVM models
gs_clf = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5)
gs_clf = gs_clf.fit(X_train, y_train)
print(gs_clf.best_params_)`<br>

`# Utilising the best model chosen
predictions = gs_clf.predict(X_test)
print(metrics.accuracy_score(y_test,predictions))`

{'C': 1, 'gamma': 1} <br>
0.7774193548387097

The best model selected has an accuracy of 77.74%.

# 4. Evaluation <a class="anchor" id="sec_4"></a>

## 4.1 Reading and preprocessing the test data

In [None]:
# creating a list to save twitter posts of authors for evaluation
test_body = []

i = 0
while i < len(test_data):
    # encode the xml data and convert to lowercase letters
    data=z.read('data/'+test_data['id'][i]+'.xml').decode("utf-8").lower()
    
    # extract post body
    pattern = re.compile(r'cdata\[(.*?)\]]>',re.DOTALL)  
    data1 = pattern.findall(data)
    
    # saving the posts in the list
    test_body = test_body+[data1]
    i = i+1

## 4.2 Tokenization and removing stop words:

In [None]:
# Clean every evaluation author's posts into one flat token list per author,
# using the same steps as for the training posts.

# Built once up front: the stop-word lookup was previously re-evaluated for
# every token, and a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Patterns compiled once, outside the loops.
whitespace_run = re.compile(r'\s+')
apostrophe_s = re.compile(r'\'s|\' s')
symbols = re.compile(r'[^\w\s]')
web_link = re.compile(r'https.*')

words_test = []
for posts in test_body:
    word_set = []
    for sentences in posts:
        # collapse runs of whitespace into a single space
        sentences = whitespace_run.sub(' ', sentences)

        # remove apostrophe s
        sentences = apostrophe_s.sub('', sentences)

        # remove symbols
        sentences = symbols.sub('', sentences)

        for token in sentences.split():
            # Expand contractions token by token. str.replace on the whole
            # sentence also rewrote matches inside longer words
            # (e.g. the "id" in "video").
            expanded = contractions.get(token, token)

            for piece in expanded.split():
                # delete web links (from "https" to the end of the token)
                piece = web_link.sub('', piece)

                # drop empty strings and stop words
                if piece and piece not in stop_words:
                    word_set.append(piece)

    words_test.append(word_set)

##  4.3 Removing the most/least frequent words

In [None]:
# One list of distinct tokens per evaluation author, so that each author
# contributes at most once to a token's document frequency.
dis_words = [list(set(author_tokens)) for author_tokens in words_test]

In [None]:
# Document frequency over the evaluation authors: for every token, the
# number of authors who used it.
word_dict = {}
for author_vocab in dis_words:
    for token in author_vocab:
        if token in word_dict:
            word_dict[token] += 1
        else:
            word_dict[token] = 1

In [None]:
# Tokens used by more than 95% or by fewer than 5% of the evaluation authors
# are discarded. The thresholds are derived from the actual number of authors
# instead of a hard-coded test-set size.
# NOTE(review): this filter is computed from the evaluation authors' own
# document frequencies, not from the training data - confirm that is intended.
n_authors = len(dis_words)
upper_bound = n_authors * 0.95
lower_bound = n_authors * 0.05

# most and least frequent words
remove_list = [
    key
    for key, doc_freq in word_dict.items()
    if doc_freq > upper_bound or doc_freq < lower_bound
]

In [None]:
# Drop the too-frequent / too-rare tokens from every evaluation author's
# token list. A set makes each membership test constant-time; testing
# against the list scanned it once per token.
tokens_to_remove = set(remove_list)

clean_word_test = [
    [word for word in author_tokens if word not in tokens_to_remove]
    for author_tokens in words_test
]

Saved preprocessed data for quick future use...

In [30]:
# np.save('data_test.npy',clean_word_test)

In [17]:
# Reload the cached evaluation token lists as plain Python lists.
# NOTE: allow_pickle=True unpickles the file's contents - only load a
# 'data_test.npy' that was produced by this notebook.
preprocessed_test_data = np.load('data_test.npy', allow_pickle=True).tolist()

## 4.4 Removing numbers and small words

In [18]:
# Single pass per author: run each token through the number converter, then
# keep it only if the result is purely alphabetic (this removes numbers) and
# at least three characters long (this removes 1- and 2-letter words).
filtered_test = []
for author_tokens in preprocessed_test_data:
    kept = []
    for token in author_tokens:
        converted = t2d.convert(token)
        if converted.isalpha() and len(converted) >= 3:
            kept.append(converted)
    filtered_test.append(kept)
preprocessed_test_data = filtered_test

## 4.5 Stemming

In [19]:
# Reduce every evaluation token to its Porter stem.
stemmer = PorterStemmer()
preprocessed_test_data = [
    list(map(stemmer.stem, author_tokens))
    for author_tokens in preprocessed_test_data
]

## 4.6 TF-IDF feature extraction for test data

In [20]:
# Transform the evaluation documents with the vectorizer that was fitted on
# the training data (transform only - it is not refitted here).
real_test =vectorizer.transform(preprocessed_test_data)

## 4.7 Logistic Model

### 4.7.1 Training Logistic Model over complete data

In [None]:
# Repeat the grid search over param_grid, this time fitting on the complete
# training matrix rather than on the 90% split.
clf = GridSearchCV(LogisticRegression(random_state=1, solver='liblinear'),param_grid).fit(x_train,train_df['gender'])

### 4.7.2 Predicting the gender

In [None]:
# Predict the gender of every evaluation author.
pred_test=clf.predict(real_test)

### 4.7.3 Writing Labels Into CSV File

In [None]:
# Empty frame that fixes the column order of the output file: id, then gender.
test_data1 = pd.DataFrame( columns =['id', 'gender'])

In [None]:
# Fill in the predictions first, then the author ids taken from test_data.
# NOTE: predictions are matched to ids by position, which relies on
# pred_test being in the same row order as test_data.
test_data1['gender']=pred_test
test_data1['id']=test_data['id']

In [None]:
# Write the predicted labels to disk without the DataFrame index column.
test_data1.to_csv("pred_labels.csv",index=False)

## 4.8 KNN

### 4.8.1 create the dataframe with the same header of training data

```python
i = 0
dis_list =[]
while i < len(preprocessed_train_data):
    # unique words
    data = list(set(preprocessed_train_data[i]))
    dis_list = dis_list + data
    i=i+1
dis_list = list(set(dis_list))
# create an empty dataframe with unique words
test_df=pd.DataFrame(columns = dis_list)
z = 0
while z < len(clean_test):
    test_df.loc[z] = 0
    for item in preprocessed_test_data[z]:
        if item in dis_list:
            # add one for each appear
            test_df.loc[z,item] = test_df.loc[z,item]+1
    z =z+1
test_df.head()
```

### 4.8.2 check the accuracy

`test_gender = pd.read_csv('test_labels.csv')['gender']  
test_accuracy()`

## 4.9 SVM

### 4.9.1 Using the best params found to create the model
C = 1.0 and gamma = 1

`# Building SVM model
clf = SVC(probability=True, C=1.0, kernel='rbf', gamma=1)
clf.fit(X_train, y_train)`

### 4.9.2 Making predictions and finding accuracy

`# Making predictions and finding accuracy
predictions = clf.predict(real_test)
labelled_test_data = pd.read_csv('test_labels.csv')
print(metrics.accuracy_score(labelled_test_data['gender'],predictions))`

Upon running the code above, we find that the SVM model performed well and has an accuracy of 77.8%.

# 5. Conclusions <a class="anchor" id="sec_5"></a>

The accuracy for the logistic regression model (best performer) for test data is 78.66%.