# Stylometry Based Method

# Importing Required Libraries

In [65]:
import pandas as pd
import numpy as np
import glob
import time
import html
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

# Load Data and Export Features

In [66]:
# Punctuation marks counted as individual features, in output-column order.
_STYLOMETRY_MARKS = (',', '.', '@', '(', ')', '!', '-', '?', '%', '&', '#', '_', '=', ';', ':', '/')

# Column order of the exported CSV; 'author' (the label) is always last.
_STYLOMETRY_COLUMNS = ['small_letters', 'capital_letters', 'spaces', 'digits',
                       *_STYLOMETRY_MARKS, 'author']


def _stylometry_row(text, author):
    """Return one feature row (character-class and punctuation counts) for *text*."""
    row = {
        'small_letters': sum(c.islower() for c in text),
        'capital_letters': sum(c.isupper() for c in text),
        'spaces': text.count(' '),
        'digits': sum(c.isdigit() for c in text),
    }
    row.update((mark, text.count(mark)) for mark in _STYLOMETRY_MARKS)
    row['author'] = author
    return row


def load_and_export():
    """Extract stylometric features from every profile and export them as CSV.

    Reads each file matching ``twitter-gender-corpus/*/*.txt``, unescapes HTML
    entities, counts lower-case letters, upper-case letters, spaces, digits and
    a fixed set of punctuation marks, and writes one row per profile to
    ``extracted-features/stylometry.csv``.

    The ``author`` label is 0 when the file path contains ``'female'`` and 1
    otherwise.  The output directory must already exist.  When no profile
    matches, a header-only CSV is written.

    Prints the exported file name and the elapsed wall-clock time.
    """
    start = time.time()
    path = "twitter-gender-corpus/*/*.txt"

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for single_profile in glob.glob(path):
        # 'male' is a substring of 'female', so test for 'female' only.
        author = 0 if 'female' in single_profile else 1
        with open(single_profile, "r", encoding="utf8") as file:
            text = html.unescape(file.read())
        rows.append(_stylometry_row(text, author))

    features = pd.DataFrame(rows, columns=_STYLOMETRY_COLUMNS)
    # Write once after the loop instead of rewriting the file per profile.
    features.to_csv('extracted-features/stylometry.csv', index=False)

    end = time.time()
    print('Exported File: stylometry.csv')
    print('Execution Time:',round(end - start, 2),'sec')

# Train and Test Classifier

In [73]:
def train_and_test(file,classifier):
    """Train a classifier on an exported feature file and print its scores.

    Parameters
    ----------
    file : str
        Name of a CSV file inside ``extracted-features/``.  Every column but
        the last is used as a feature; the last column is the label.
    classifier : str
        Name of a classifier class available in this module's namespace
        (e.g. ``'GaussianNB'``).  It is instantiated with default arguments.

    Raises
    ------
    ValueError
        If ``classifier`` does not name a callable in the module namespace.

    Prints the file name, the elapsed time, the accuracy and the error on a
    33% hold-out split (fixed ``random_state=30``).
    """
    start = time.time()
    stylometry = pd.read_csv('extracted-features/'+file)
    X = stylometry.iloc[:,:-1]
    # Select the label as a 1-D Series; a one-column DataFrame makes
    # scikit-learn emit a column-vector conversion warning in fit().
    y = stylometry.iloc[:,-1]
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=30)

    # Resolve the class by name instead of eval(), which would execute any
    # expression embedded in the string.
    factory = globals().get(classifier)
    if not callable(factory):
        raise ValueError('Unknown classifier: ' + repr(classifier))
    model = factory()

    model.fit(X_train,y_train)
    prediction = model.predict(X_test)
    end = time.time()

    # Built-in round() works for both NumPy scalars and plain Python floats;
    # the metric functions are not guaranteed to return an object with .round().
    accuracy = round(accuracy_score(y_test,prediction.round()), 3)
    # NOTE: this is the mean squared error (no square root is taken); the
    # "RMS Error" label is kept so the printed output format stays unchanged.
    error = round(mean_squared_error(y_test,prediction.round()), 3)

    print('File:',file)
    print('Execution Time:',round(end - start, 2),'sec')
    print('Accuracy: [',str(classifier),':',accuracy,']')
    print('RMS Error: [',str(classifier),':',error,']')

# Main

In [74]:
# Extract the features from the corpus and write extracted-features/stylometry.csv.
load_and_export()

Exported File: stylometry.csv
Execution Time: 48.31 sec


In [75]:
# Evaluate a Gaussian Naive Bayes classifier on the exported features.
train_and_test('stylometry.csv','GaussianNB')

File: stylometry.csv
Execution Time: 0.02 sec
Accuracy: [ GaussianNB : 0.631 ]
RMS Error: [ GaussianNB : 0.369 ]


In [76]:
# Evaluate a linear support vector classifier on the exported features.
train_and_test('stylometry.csv','LinearSVC')

File: stylometry.csv
Execution Time: 0.08 sec
Accuracy: [ LinearSVC : 0.574 ]
RMS Error: [ LinearSVC : 0.426 ]


In [77]:
# Evaluate an AdaBoost classifier on the exported features.
train_and_test('stylometry.csv','AdaBoostClassifier')

File: stylometry.csv
Execution Time: 0.22 sec
Accuracy: [ AdaBoostClassifier : 0.638 ]
RMS Error: [ AdaBoostClassifier : 0.362 ]


In [78]:
# Evaluate a random forest classifier on the exported features.
train_and_test('stylometry.csv','RandomForestClassifier')

File: stylometry.csv
Execution Time: 0.03 sec
Accuracy: [ RandomForestClassifier : 0.596 ]
RMS Error: [ RandomForestClassifier : 0.404 ]
