# Document Classifier

**load_files** Loads text files with categories as subfolder names with shuffling. This function returns a dictionary-like object.

In [1]:
# from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files

In [2]:
# categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# all_of_it = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
all_of_it = load_files("D:\kaam\AdditionalParsed", shuffle=True, random_state = None)

Divide train/test into **80:20** ratio

In [3]:
total = len(all_of_it.data)
num = int(0.8*total)
print("No. of Training data: " , num)
print("No. of Testing data: " , total - num)

train_data = all_of_it.data[:num]
test_data = all_of_it.data[num:]

# The target class names: AoI, MC
all_of_it.target_names

No. of Training data:  1417
No. of Testing data:  355


['AoI', 'MC']

### Some details about the dataset

In [4]:
print(dir(all_of_it))     # Lists the directories available
print(all_of_it.target_names)  # target names are the names in which we want to classify the documents
# print(type(all_of_it.description))
print(len(all_of_it.data))     # There are 1772 text files total in AoI and MC folder
print(all_of_it.target[:10])   # print first 10 file-paths

['DESCR', 'data', 'filenames', 'target', 'target_names']
['AoI', 'MC']
1772
[0 1 1 1 1 1 1 0 1 0]


#### How the files look like:

In [5]:
all_of_it.filenames[:5]


array(['D:\\kaam\\AdditionalParsed\\AoI\\0000927356-97-000018-3.txt',
       'D:\\kaam\\AdditionalParsed\\MC\\0001193125-17-095656-2.txt',
       'D:\\kaam\\AdditionalParsed\\MC\\0001554795-17-000149-2.txt',
       'D:\\kaam\\AdditionalParsed\\MC\\0001559484-17-000004-2.txt',
       'D:\\kaam\\AdditionalParsed\\MC\\0001558370-17-002200-7.txt'], 
      dtype='<U56')

### Sample Document

In [6]:
print(all_of_it.data[0])

b'<FilingToolDoc root-name="Document" wk-pid="SEC-0000927356-97-000018-3"><Meta xmlns="http://www.wolterskluver.com/namespace/meta"><Extention>txt</Extention><Original-Size>158321</Original-Size><Exhibit-Name>Articles of incorporation</Exhibit-Name><Type>3</Type><Filing-Updated>2016-07-28T12:25:52.419497-05:00</Filing-Updated><Source>Filing</Source><Accession-Number>0000927356-97-000018</Accession-Number><Filing-Date>1997-01-10</Filing-Date><Form-Type>10-K</Form-Type><Form-Group>Annual &amp; Quarterly Reports</Form-Group><CIK>0000745655</CIK><Author>INTERCELL CORP</Author><Ticker-Symbol>CCTR</Ticker-Symbol><Issue-Type>COM</Issue-Type><Market>OTC</Market><Sic-Code>4813</Sic-Code><Sic-Name>Telephone Communications, Except Radiotelephone</Sic-Name><Region-Name>COLORADO</Region-Name><Incorporated-Region-Name>NEVADA</Incorporated-Region-Name><State-Of-Incorporation-Name>NEVADA</State-Of-Incorporation-Name><Silent-Update>true</Silent-Update><Filer-Type>Small Reporting Company</Filer-Type><Fi

**CountVectorizer** converts text documents to a matrix representation of token counts.
It takes a list of "strings" as input and creates a matrix

_no. of (documents_) **x** (_vocabulary_)



In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()    #Convert a collection of text documents to a matrix of token counts
X = count_vect.fit_transform(train_data)

You can take a look at the dimensions of the matrix and the vocabulary that it has created

In [8]:
print(X.shape)          # training data contains 1417 samples therefore 1417 rows
count_vect.vocabulary_  # a dictionary which maps term to feature indices

(1417, 46074)


{'filingtooldoc': 22479,
 'root': 37525,
 'name': 30804,
 'document': 19060,
 'wk': 44838,
 'pid': 33860,
 'sec': 38343,
 '0000927356': 299,
 '97': 9723,
 '000018': 78,
 'meta': 29867,
 'xmlns': 45084,
 'http': 25097,
 'www': 45008,
 'wolterskluver': 44876,
 'com': 15837,
 'namespace': 30812,
 'extention': 21917,
 'txt': 42386,
 'original': 32672,
 'size': 39122,
 '158321': 3356,
 'exhibit': 21642,
 'articles': 12243,
 'of': 32117,
 'incorporation': 25873,
 'type': 42392,
 'filing': 22476,
 'updated': 43140,
 '2016': 4148,
 '07': 1913,
 '28t12': 5141,
 '25': 4784,
 '52': 7257,
 '419497': 6553,
 '05': 1799,
 '00': 0,
 'source': 39413,
 'accession': 10301,
 'number': 31822,
 'date': 17597,
 '1997': 3917,
 '01': 1412,
 '10': 2128,
 'form': 22860,
 'group': 24111,
 'annual': 11679,
 'amp': 11473,
 'quarterly': 35339,
 'reports': 36673,
 'cik': 15363,
 '0000745655': 159,
 'author': 12741,
 'intercell': 26453,
 'corp': 16797,
 'ticker': 41557,
 'symbol': 40738,
 'cctr': 14853,
 'issue': 2693

In [9]:
X.toarray()

array([[ 9, 39,  0, ...,  0,  0,  0],
       [ 1, 11,  0, ...,  0,  0,  0],
       [ 2,  2,  0, ...,  0,  0,  0],
       ..., 
       [ 1,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0],
       [ 1,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
tf = TfidfTransformer(sublinear_tf=True)
X_train_tf = tf.fit_transform(X)
X_train_tf.shape

(1417, 46074)

In [11]:

X_train_tf.toarray()

array([[ 0.01223872,  0.0236456 ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01188943,  0.05351074,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01309894,  0.01735026,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.01325197,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.00578805,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0335436 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [12]:
# docs_test = ["God is great", "Retina scan gives early diagnosis about diabetes"]
# X_test = count_vect.transform(docs_test)
# X_tf_test = tf.transform(X_test)

X_test = count_vect.transform(test_data)
X_test_tf = tf.transform(X_test)
X_test_tf.shape

(355, 46074)

In [13]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB()
# clf.fit(X_train_tf,all_of_it.target[:num])

In [14]:
from sklearn import svm
# apply one-vs-one svm classification taking radial basis function as the kernel
clf = svm.SVC(decision_function_shape="ovo", C = 10000.0, kernel='rbf',gamma = 0.6)
clf.fit(X_train_tf, all_of_it.target[:num])

SVC(C=10000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma=0.6, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
# from sklearn.externals import joblib
# modelfile = "20-news-svm.sav"
# joblib.dump(clf,modelfile)

# loadedmodel = joblib.load(modelfile)

In [16]:
# Accuracy (score)
print(clf.score(X_test_tf, all_of_it.target[num:]))

# predict = clf.predict(X_test_tf)
# for text, category in zip(docs_test,predict):
#     print(text + "\tbelongs to:\t" + all_of_it.target_names[category])

1.0


In [17]:
# all_of_it.target_names[int(clf.predict(tf.fit_transform(count_vect.transform(f.read()))))]
HOME_DIR = "D:\kaam\AdditionalParsedTest"
import os
print("File:\tClassification:")
for home,subdir,files in os.walk(HOME_DIR):
    for file in files:
        with open(os.path.join(HOME_DIR, file)) as f:
                print(file + "\t" + all_of_it.target_names[int(clf.predict(tf.transform(count_vect.transform([f.read()]))))])


File:	Classification:
AOI1.txt	AoI
AOI2.txt	AoI
AOI3.txt	AoI
AOI4.txt	AoI
AOI5.txt	AoI
MC1.txt	MC
MC2.txt	MC
MC3.txt	MC
MC4.txt	MC
MC5.txt	MC
