# Author Identification 

## Part-2 : Cross Validation

### Import Libraries

In [2]:
import numpy as np
import pandas as pd
import os
from nltk import word_tokenize
from nltk.corpus import words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score , confusion_matrix , accuracy_score
from sklearn.svm import SVC
import plot_confusion_matrix
import matplotlib.pyplot as plt
import glob , itertools , add_feature

%matplotlib inline

### Create Dataset from author folders (Reuters C-50 dataset) 

In [9]:
df = pd.DataFrame()

dfs=[]
author_names = sorted(os.listdir('text_corpus/C50train/'))
number_authors = len(author_names)
for authors in author_names:
    article_corpus=[]
    for file in glob.glob(os.path.join('text_corpus/C50train',authors,'*.txt')):
        with open(file) as fh:
            article_corpus.append(fh.read())

    dfs.append(pd.DataFrame({'Story':article_corpus , 'Author':authors}))

dataset = pd.concat(dfs , axis=0)
for author_idx in range(0,50*number_authors,50):
    vect = TfidfVectorizer().fit_transform(dataset.iloc[author_idx:author_idx+50,1])
    similarity = (vect * vect.T).A
    
    visited = np.zeros([50])
    for i in range(50):
        if(not visited[i]):
            visited[i] = 1
            idx = similarity[i,:] > 0.6
            sim_list = np.where(idx)[0]
            if len(sim_list) > 0:
                for doc_id in sim_list[1:]:
                    dataset.iloc[author_idx+doc_id,1] = np.nan
                    visited[doc_id] = 1

dataset = dataset.dropna()
counts = dataset.groupby('Author').agg('count').reset_index()
author_shortlist = counts.sort_values('Story',ascending=False).iloc[10:20,0].values
dataset_valid = dataset[dataset['Author'].isin(author_shortlist)].reset_index(drop=True)

#### Change labels to encoding

In [10]:
encoding = LabelEncoder().fit(dataset_valid['Author'])
author_ids = list(encoding.transform(dataset_valid['Author']))
dataset_valid['Author_id'] = author_ids

### Build Model and Train

#### Train-test split the data

In [11]:
X_train = dataset_valid['Story'].values
y_train = dataset_valid['Author_id'].values

#### Build the features using tf-idf vectorizer

In [12]:
vect = TfidfVectorizer(min_df=7,max_df=0.90,ngram_range=(3,5),analyzer='char_wb').fit(X_train)
X_train_transform = vect.transform(X_train)

#### Model using linear SVC

In [17]:
model = SVC(kernel='linear',C=10,class_weight='balanced')

score_f1 = cross_val_score(model , X_train_transform , y_train , scoring='f1_weighted')

print('Obtained f1 accuracy: ' , score_f1*100)

Obtained f1 accuracy:  [ 83.3116026   90.99290231  88.77049335]
