In [83]:
import numpy as np
import pandas as pd
import os
from nltk import word_tokenize
from nltk.corpus import words
from sklearn.metrics import f1_score , 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
import glob
df = pd.DataFrame()

In [5]:
dfs=[]
author_names = sorted(os.listdir('text_corpus/C50train/'))
number_authors = len(author_names)
for authors in author_names:
    article_corpus=[]
    for file in glob.glob(os.path.join('text_corpus/C50train',authors,'*.txt')):
        with open(file) as fh:
            article_corpus.append(fh.read())

    dfs.append(pd.DataFrame({'Story':article_corpus , 'Author':authors}))

In [45]:
dataset = pd.concat(dfs , axis=0)

In [46]:
for author_idx in range(0,50*number_authors,50):
    vect = TfidfVectorizer().fit_transform(dataset.iloc[author_idx:author_idx+50,1])
    similarity = (vect * vect.T).A
    
    visited = np.zeros([50])
    for i in range(50):
        if(not visited[i]):
            visited[i] = 1
            idx = similarity[i,:] > 0.6
            sim_list = np.where(idx)[0]
            if len(sim_list) > 0:
                for doc_id in sim_list[1:]:
                    dataset.iloc[author_idx+doc_id,1] = np.nan
                    visited[doc_id] = 1

In [47]:
dataset = dataset.dropna()
counts = dataset.groupby('Author').agg('count').reset_index()
author_list_largest = counts.sort_values('Story',ascending=False).iloc[2:9,0].values
dataset_valid = dataset[dataset['Author'].isin(author_list_largest)]
dataset_valid.shape

(297, 2)

In [67]:
dataset_valid.reset_index(drop=True)

Unnamed: 0,Author,Story,Author_id
0,LynnleyBrowning,"Russia, dramatically changing its approach to ...",0
1,LynnleyBrowning,Western oil companies frustrated by Moscow's f...,0
2,LynnleyBrowning,Russia is finding the return of competitive Ir...,0
3,LynnleyBrowning,Russia is eyeing its second consecutive bad gr...,0
4,LynnleyBrowning,Russia's oil industry earned higher export rev...,0
5,LynnleyBrowning,"Russia, hoping to boost industrial output, ann...",0
6,LynnleyBrowning,"Russia, whose sudden introduction of export cu...",0
7,LynnleyBrowning,Russia is trying to make things tougher for ex...,0
8,LynnleyBrowning,Russia is trying to shift financing of its tar...,0
9,LynnleyBrowning,Russia's state oil pipeline firm Transneft on ...,0


In [48]:
## Label Encoding
encoding = LabelEncoder().fit(dataset_valid['Author'])
author_ids = list(encoding.transform(dataset_valid['Author']))
dataset_valid['Author_id'] = author_ids


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Author,Story,Author_id
0,LynnleyBrowning,"Russia, dramatically changing its approach to ...",0
1,LynnleyBrowning,Western oil companies frustrated by Moscow's f...,0
2,LynnleyBrowning,Russia is finding the return of competitive Ir...,0
3,LynnleyBrowning,Russia is eyeing its second consecutive bad gr...,0
4,LynnleyBrowning,Russia's oil industry earned higher export rev...,0
5,LynnleyBrowning,"Russia, hoping to boost industrial output, ann...",0
6,LynnleyBrowning,"Russia, whose sudden introduction of export cu...",0
7,LynnleyBrowning,Russia is trying to make things tougher for ex...,0
8,LynnleyBrowning,Russia is trying to shift financing of its tar...,0
9,LynnleyBrowning,Russia's state oil pipeline firm Transneft on ...,0


In [34]:
##Learning Starts

In [77]:
X_train , X_test , y_train , y_test = train_test_split(dataset_valid['Story'].values , dataset_valid['Author_id'].values , test_size=0.1)

In [80]:
vect = TfidfVectorizer().fit(X_train)
X_train_transform = vect.transform(X_train)
X_train_transform.shape

(267, 10615)

In [121]:
model = SVC(kernel='linear',C=10 , class_weight='balanced')
model.fit(X_train_transform , y_train)

predictions = model.predict(vect.transform(X_test))

score = f1_score(y_test , predictions , average='weighted')

In [122]:
score

0.65574795574795575

In [104]:
y_train

array([0, 1, 5, 4, 3, 5, 0, 1, 1, 3, 3, 6, 2, 4, 6, 5, 5, 6, 3, 4, 2, 6, 1,
       4, 3, 1, 3, 0, 6, 3, 2, 0, 3, 5, 1, 3, 3, 6, 1, 2, 1, 0, 2, 5, 3, 4,
       2, 3, 1, 1, 1, 5, 1, 3, 2, 2, 5, 6, 0, 0, 6, 6, 1, 2, 2, 5, 5, 4, 0,
       6, 1, 2, 0, 4, 6, 5, 6, 5, 0, 0, 3, 4, 2, 6, 3, 5, 6, 0, 1, 5, 2, 4,
       1, 3, 4, 0, 6, 6, 5, 1, 4, 4, 0, 3, 2, 3, 5, 1, 3, 4, 1, 0, 5, 3, 1,
       1, 1, 0, 4, 1, 6, 4, 0, 6, 1, 5, 5, 4, 2, 3, 1, 1, 3, 5, 0, 3, 5, 1,
       1, 2, 0, 5, 4, 2, 6, 0, 3, 1, 3, 4, 6, 1, 4, 3, 3, 3, 6, 4, 4, 0, 6,
       1, 4, 2, 1, 5, 3, 2, 3, 0, 1, 1, 4, 4, 6, 5, 5, 5, 5, 6, 0, 1, 2, 0,
       1, 5, 4, 4, 6, 3, 1, 6, 1, 6, 3, 3, 2, 5, 2, 5, 2, 4, 2, 4, 5, 5, 3,
       2, 0, 4, 2, 0, 1, 2, 5, 1, 1, 0, 6, 0, 0, 4, 4, 1, 5, 6, 6, 0, 4, 3,
       5, 6, 1, 6, 5, 6, 0, 3, 5, 0, 0, 0, 2, 2, 3, 3, 2, 6, 2, 4, 2, 5, 4,
       2, 5, 0, 6, 2, 6, 2, 6, 0, 6, 0, 1, 0, 3])