# Author Profiling - Content Based Method

# Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import glob
import re
import html
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import RFE
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Loading Data From "twitter-gender-corpus" Folder

In [27]:
def load_data(preprocessing):
    """Read every author profile found under ``twitter-gender-corpus``.

    Parameters
    ----------
    preprocessing : bool or int
        When truthy, links (``http...``) are replaced by a space and every
        run of characters other than letters, digits, ``@`` and ``#`` is
        collapsed to a single space.

    Returns
    -------
    dict
        ``{'authors': [text, ...], 'author_type': [label, ...]}`` where the
        label is 0 when the file path contains ``'female'`` and 1 otherwise.
        Both lists are ordered by sorted file path.
    """
    path = "twitter-gender-corpus/*/*.txt"
    # glob yields paths in arbitrary order; sorting keeps the sample order
    # (and therefore any later seeded train/test split) reproducible.
    profiles = sorted(glob.glob(path))

    # Compile once instead of on every file.
    link_pattern = re.compile(r"http\S+")
    special_pattern = re.compile(r"[^a-zA-Z0-9@#]+")

    authors = []
    author_type = []

    for single_profile in profiles:
        # Test for 'female' rather than 'male': 'male' is a substring of
        # 'female', so the reverse check would label every profile as male.
        author_type.append(0 if 'female' in single_profile else 1)

        # The context manager closes the handle even if reading fails.
        with open(single_profile, "r", encoding="utf8") as file:
            text = file.read()

        text = html.unescape(text)
        if preprocessing:
            text = link_pattern.sub(" ", text)
            text = special_pattern.sub(" ", text)
        authors.append(text)

    return {'authors': authors, 'author_type': author_type}

# Extract Word N-gram Features

In [None]:
def word_n_gram(n):
    """Extract TF-IDF word n-gram features and export them as CSV.

    Reads the module-level ``authors`` (texts) and ``author_type`` (labels),
    keeps the 1000 highest-ranked word ``n``-grams, and writes the feature
    matrix plus a trailing ``author`` label column to
    ``extracted-features/word_<n>_gram.csv``. The output directory must
    already exist.

    Parameters
    ----------
    n : int
        Size of the word n-grams (used as both ends of ``ngram_range``).
    """
    start = time.time()
    vector = TfidfVectorizer( # initialize tfidf vectorizer
        stop_words='english',
        strip_accents='unicode',
        lowercase=True,
        analyzer='word',
        ngram_range=(n, n),
        max_features=1000)

    X = vector.fit_transform(authors)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vector, 'get_feature_names_out'):
        features = list(vector.get_feature_names_out())
    else:
        features = vector.get_feature_names()

    data = pd.DataFrame(data=X.toarray(), columns=features) # convert to dataframe

    # Append the label as a genuinely new last column. Plain
    # data['author'] = ... would overwrite a feature column whenever
    # "author" is itself one of the extracted n-grams, leaving the label
    # somewhere in the middle while downstream code reads it positionally
    # from the last column.
    data.insert(len(data.columns), 'author', author_type, allow_duplicates=True)

    data.to_csv('extracted-features/word_'+str(n)+'_gram.csv',index=False) # save in folder
    end = time.time()
    print('Exported File:','word_'+str(n)+'_gram.csv')
    print('Execution Time:',round(end - start, 2),'sec\n')

# Extract Char N-gram Features

In [None]:
def char_n_gram(n):
    """Extract TF-IDF character n-gram features and export them as CSV.

    Reads the module-level ``authors`` (texts) and ``author_type`` (labels),
    keeps the 1000 highest-ranked character ``n``-grams, and writes the
    feature matrix plus a trailing ``author`` label column to
    ``extracted-features/char_<n>_gram.csv``. The output directory must
    already exist.

    Parameters
    ----------
    n : int
        Size of the character n-grams (used as both ends of ``ngram_range``).
    """
    start = time.time()
    # stop_words is intentionally not passed: scikit-learn only applies it
    # when analyzer == 'word', so it had no effect on character n-grams.
    vector = TfidfVectorizer( # initialize tfidf vectorizer
        strip_accents='unicode',
        lowercase=True,
        analyzer='char',
        ngram_range=(n, n),
        max_features=1000)

    X = vector.fit_transform(authors)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(vector, 'get_feature_names_out'):
        features = list(vector.get_feature_names_out())
    else:
        features = vector.get_feature_names()

    data = pd.DataFrame(data=X.toarray(), columns=features) # convert to dataframe

    # Append the label as a genuinely new last column. Plain
    # data['author'] = ... would overwrite a feature column whenever the
    # n-gram "author" is itself a feature (possible for n == 6), leaving the
    # label in the middle while downstream code reads it positionally from
    # the last column.
    data.insert(len(data.columns), 'author', author_type, allow_duplicates=True)

    data.to_csv('extracted-features/char_'+str(n)+'_gram.csv',index=False) # save in folder
    end = time.time()
    print('Exported File:','char_'+str(n)+'_gram.csv')
    print('Execution Time:',round(end - start, 2),'sec\n')

# Feature Selection

In [None]:
def feature_selection(features, name, n_features=50):
    """Keep the most prominent features using RFE with a linear SVM.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table whose last column is the label and whose remaining
        columns are the candidate features.
    name : str
        Source file name; the part before the first ``'.'`` is used to build
        the output path ``data/<stem>_reduced.csv``. The ``data`` directory
        must already exist.
    n_features : int, optional
        Number of features Recursive Feature Elimination keeps (default 50).

    Returns
    -------
    pandas.DataFrame
        The selected feature columns, without the label column.
    """
    stem = name.split('.')[0]
    X = features.iloc[:, :-1]
    # Use a 1-D label vector; estimators expect shape (n_samples,) rather
    # than a single-column frame.
    y = features.iloc[:, -1]

    # n_features_to_select must be passed by keyword: current scikit-learn
    # releases reject it as a positional argument.
    rfe = RFE(LinearSVC(), n_features_to_select=n_features)
    rfe.fit(X, y)

    # Build the reduced frame once and reuse it for both export and return.
    selected = X.loc[:, rfe.support_]
    pd.concat([selected, y], axis=1).to_csv('data/'+stem+'_reduced.csv', index=False)
    return selected


# Train and Test Classifier

In [None]:
def train_and_test(file, classifier):
    """Train one classifier on an exported feature file and print its scores.

    Loads ``extracted-features/<file>``, reduces the features with
    ``feature_selection``, splits 67/33 with a fixed seed, fits the
    classifier and prints the timing, accuracy and RMS error.

    Parameters
    ----------
    file : str
        CSV file name inside ``extracted-features`` (label in last column).
    classifier : str
        Name of a classifier class available at module level, e.g.
        ``'GaussianNB'``; it is instantiated with default arguments.

    Raises
    ------
    ValueError
        If ``classifier`` does not name a callable defined at module level.
    """
    # Resolve the class by name instead of eval(): same result for valid
    # names, but an arbitrary expression string can no longer be executed.
    model_factory = globals().get(classifier)
    if not callable(model_factory):
        raise ValueError('Unknown classifier: ' + str(classifier))

    start = time.time()
    data = pd.read_csv('extracted-features/'+file)
    # NOTE(review): features are selected on the full dataset before the
    # train/test split, so the test rows influence the selection.
    X = feature_selection(data, file)
    y = data.iloc[:, -1]  # 1-D labels, as the estimators expect
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=30)

    model = model_factory()
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    end = time.time()

    # The metrics may be returned as plain Python floats, which have no
    # .round() method; the builtin round() works for either type.
    accuracy = round(float(accuracy_score(y_test, prediction)), 3)
    # The label says "RMS Error", so take the square root of the MSE.
    rms_error = round(float(np.sqrt(mean_squared_error(y_test, prediction))), 3)

    print('File:',file)
    print('Execution Time:',round(end - start, 2),'sec')
    print('Accuracy: [',str(classifier),':',accuracy,']')
    print('RMS Error: [',str(classifier),':',rms_error,']\n')

# Main

In [None]:
# Load the corpus with preprocessing enabled and expose texts/labels as the
# module-level names the extraction functions read.
data = load_data(1)
authors, author_type = data['authors'], data['author_type']

# Export word 1-, 2- and 3-gram feature files.
for gram_size in (1, 2, 3):
    word_n_gram(gram_size)

In [None]:
# Load the corpus WITHOUT preprocessing. The result must be bound to `data`:
# binding it to `authors` and then reading data['authors'] would silently
# reuse the preprocessed texts from the previous load.
data = load_data(0)
authors = data['authors']
author_type = data['author_type']

# extract char 3-10 gram features
for gram_size in range(3, 11):
    char_n_gram(gram_size)

In [None]:
# Evaluate each of the listed classifiers on the word 1-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('word_1_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the word 2-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('word_2_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the word 3-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('word_3_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 3-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_3_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 4-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_4_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 5-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_5_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 6-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_6_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 7-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_7_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 8-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_8_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 9-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_9_gram.csv', classifier_name)

In [None]:
# Evaluate each of the listed classifiers on the char 10-gram features.
for classifier_name in ('GaussianNB', 'LinearSVC', 'AdaBoostClassifier', 'RandomForestClassifier'):
    train_and_test('char_10_gram.csv', classifier_name)