In [1]:
import pickle
import sklearn
from os.path import join
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import os, errno

In [2]:
basepath = "/mfs/replicated/datasets/theme_en_echo/"
#basepath = "/mfs/replicated/datasets/theme_cs_golf/"

SUBJECT_SALT = "_aGef194Kl"

# DEFINE OUTPUT FILES
corpusFilename = basepath + "corpus.pkl"
corpusPreprocessedFilename = basepath + "corpusPreprocessed.pkl"
labelsIdsFilename = basepath + "labelsIds.pkl"
splitIndicesFilename = basepath + "splitIndices.pkl"
vectorizerFilenameTemplate = basepath + "theme-categorization_%s_vectorizer_lemma_%s.pkl"
modelFilenameTemplate = basepath + "theme-categorization_%s_model_lemma_%s.pkl"
reportFilenameTemplate = basepath + "%s-report.txt"

name = "echo"
language = "en"

#name = "golf"
#language = "cs"

processedFolderPath = join(basepath, "..", "processed")
vectorizerFilename = vectorizerFilenameTemplate % (name, language)
modelFilename = modelFilenameTemplate % (name, language)
reportFilename = reportFilenameTemplate % (name)

# Loading converted data

In [3]:
[labels, ids]  = pickle.load( open(join(processedFolderPath,labelsIdsFilename), "rb" ))
corpus = pickle.load( open(join(processedFolderPath,corpusFilename), "rb" ))

In [4]:
corpus[:2]

[("you're bank might owe you money the claims guys marie, start reclaiming your mis-sold ppi money today! \xc2\xa0 could you be owed money for mis-sold payment protection insurance (ppi) on a credit card or loan? \xc2\xa0 natwest \xc2\xa0 lloyds \xc2\xa0 barclays \xc2\xa0 any other lender \xc2\xa0 \xc2\xa0 start your 30 second application* \xc2\xbb if you've had a loan or credit card with lloyds, barclays, natwest or any other lender, you could be one of the millions who was mis-sold a ppi policy. \xc2\xa0 \xc2\xa0 start your 30 second application* \xc2\xbb \xc2\xa0 \xc2\xa0 \xc2\xa0 choose the claims guys \xe2\x80\xa2 \xc2\xa3300m claimed back for customers** \xe2\x80\xa2 average tcg customer claim, \xc2\xa33,200** \xe2\x80\xa2 we do not charge upfront fees \xe2\x80\xa2 no win, no fee*** \xc2\xa0 \xc2\xa0 *online application to request a call back from the claims guys ** figure applies across all lenders and across all the claims guys customers ***a fee will be payable for all claim(s

In [5]:
labels[:2]

('loan_nonbank', 'loan_nonbank')

In [6]:
ids[:2]

('loan_nonbank/loan_nonbank_charlie/0002923b62be19dddab7c60285bec9b6',
 'loan_nonbank/loan_nonbank_charlie/0012c8a80ea0f8f557c04e2d7ab91089')

# Loading preprocessed corpus

In [7]:
processedCorpus = pickle.load( open(join(processedFolderPath,corpusPreprocessedFilename), "rb" ))

In [8]:
len(processedCorpus)

6106

In [9]:
processedCorpus[:2]

[u'bank owe money claims guys marie start reclaiming mis sold ppi money today owed money mis sold payment protection insurance ppi credit card loan natwest lloyds barclays lender start second application loan credit card lloyds barclays natwest lender millions mis sold ppi policy start second application choose claims guys claimed customers average tcg customer claim charge upfront fees win fee online application request claims guys figure applies lenders claims guys customers fee payable claim cancelled day cooling period reasonable offer lender claims guys lynfield house church street altrincham cheshire registered england company regulated claims management regulator repsect regulated claims management activities crm registration recorded website www gov moj cmr hope enjoyed receiving email longer wish receive emails company visit link unsubscribe claims guys marie start reclaiming mis sold ppi money today owed money mis sold payment protection insurance ppi credit card loan start s

# DATA SPLIT

In [10]:
stratifiedSplit = StratifiedShuffleSplit(labels, 1, test_size=0.2, random_state=0)
[train_indices, test_indices] = stratifiedSplit.__iter__().next()

In [11]:
##### APPLY INDICES
processedCorpus_train = [processedCorpus[i] for i in train_indices]
labels_train = [labels[i] for i in train_indices]
ids_train = [ids[i] for i in train_indices]

processedCorpus_test = [processedCorpus[i] for i in test_indices]
labels_test = [labels[i] for i in test_indices]
ids_test = [ids[i] for i in test_indices]

In [12]:
len(train_indices), len(test_indices)

(4884, 1222)

In [13]:
train_indices[:2]

array([1177, 3414])

In [14]:
processedCorpus_train[:2]

[u'great ideas come great rewards pentair thermal open innovation awards view email web page click dear vladim\xedr pentair thermal management history built innovations based great customers like pentair open innovation awards decided chance turn experience imagination special great ideas wholesaler installer engineer employee want hear great ideas idea idea make products easier use matter big small simple complex like hear great rewards course great ideas come great rewards evaluate ideas select final winner wins vip tickets final formula race london june able present idea receive official pentair open innovation award cheque start idea great ideas wait register today seconds submit idea time april stay imaginative linda kiss technology innovation director trademarks property respective owners rights reserved pentair email sent pentair wayzata blvd minneapolis usa update profile unsubscribe vladim\xedr_aGef194Kl ready_aGef194Kl ideas_aGef194Kl',
 u'central bank nigeria office director

# VECTORIZE

In [15]:
vectorizer = TfidfVectorizer(norm='l2', use_idf=True, smooth_idf=True, max_features=50000)
X_train = vectorizer.fit_transform(processedCorpus_train)
X_test = vectorizer.transform(processedCorpus_test)

In [16]:
X_train.shape

(4884, 29591)

In [17]:
X_train[0].shape

(1, 29591)

In [18]:
X_train[0].size

94

In [19]:
print X_train[:2]

  (0, 12649)	0.0790155074896
  (0, 21574)	0.0496985712631
  (0, 28219)	0.0857707953929
  (0, 27588)	0.0276759082009
  (0, 20881)	0.0413383699962
  (0, 27628)	0.0340373720028
  (0, 27703)	0.0336356692108
  (0, 17005)	0.0790155074896
  (0, 3106)	0.0590590608902
  (0, 28554)	0.0818192052879
  (0, 23652)	0.0251073671344
  (0, 22299)	0.035324969954
  (0, 22627)	0.0344815622129
  (0, 19231)	0.0498196389114
  (0, 22354)	0.0524622451022
  (0, 20985)	0.0485640568863
  (0, 26737)	0.0507120363302
  (0, 7392)	0.0275016487882
  (0, 26032)	0.0499422294631
  (0, 14585)	0.0790155074896
  (0, 15444)	0.0611554977528
  (0, 12767)	0.0857707953929
  (0, 25004)	0.0453155061718
  (0, 1505)	0.0524622451022
  (0, 26458)	0.0237075870697
  :	:
  (1, 6523)	0.0705968488124
  (1, 9803)	0.121000823502
  (1, 9838)	0.0699568487191
  (1, 19668)	0.0819419462479
  (1, 4304)	0.164344868929
  (1, 26080)	0.10169418088
  (1, 19243)	0.152092996782
  (1, 18637)	0.152092996782
  (1, 4237)	0.337080962814
  (1, 21798)	0.163969659

# MODEL TRAINING

In [20]:
svm = LinearSVC(C=1.0, loss='l2', class_weight='auto')
svm.fit(X_train, labels_train)
#pickle.dump(svm, open(join(processedFolderPath,modelFilename),'w+'))   

LinearSVC(C=1.0, class_weight='auto', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
     random_state=None, tol=0.0001, verbose=0)

In [21]:
##### LOAD MODEL
#svm = pickle.load( open(join(processedFolderPath,modelFilename), "rb" ))

# MODEL TESTING

In [22]:
##### GET PREDICTIONS
predictions = svm.predict(X_test)

##### GET STATISTICS
report = ""
report += "-----------------------------------------------------\n"
report += "------------ CLASSIFICATOR STATISTICS ---------------\n"
report += "-----------------------------------------------------\n\n"
report += 'Precision:\t%s\n' % sklearn.metrics.precision_score(labels_test, predictions,
                                                                pos_label=labels_test[0])
report += 'Accuracy:\t%s\n\n'% sklearn.metrics.accuracy_score(labels_test, predictions)
report += "-----------------------------------------------------\n"
report += "------------ STATISTICS PER CATEGORY ----------------\n"
report += "-----------------------------------------------------\n\n"
report += classification_report(labels_test, predictions)

print report

-----------------------------------------------------
------------ CLASSIFICATOR STATISTICS ---------------
-----------------------------------------------------

Precision:	0.958823916817
Accuracy:	0.957446808511

-----------------------------------------------------
------------ STATISTICS PER CATEGORY ----------------
-----------------------------------------------------

             precision    recall  f1-score   support

account_statement       0.79      0.85      0.81        13
        cam       1.00      1.00      1.00         6
dating_erotic       0.82      0.88      0.85        16
   discount       0.88      0.88      0.88       121
      ebola       1.00      0.99      0.99       875
loan_nonbank       0.87      0.87      0.87        15
 online_bet       0.81      0.91      0.86        23
   pay_easy       0.76      0.81      0.79        16
   pharmacy       0.74      0.81      0.77        21
soc_network       0.84      0.87      0.85        30
transactional       0.92     

In [23]:
processedCorpus[0]

u'bank owe money claims guys marie start reclaiming mis sold ppi money today owed money mis sold payment protection insurance ppi credit card loan natwest lloyds barclays lender start second application loan credit card lloyds barclays natwest lender millions mis sold ppi policy start second application choose claims guys claimed customers average tcg customer claim charge upfront fees win fee online application request claims guys figure applies lenders claims guys customers fee payable claim cancelled day cooling period reasonable offer lender claims guys lynfield house church street altrincham cheshire registered england company regulated claims management regulator repsect regulated claims management activities crm registration recorded website www gov moj cmr hope enjoyed receiving email longer wish receive emails company visit link unsubscribe claims guys marie start reclaiming mis sold ppi money today owed money mis sold payment protection insurance ppi credit card loan start se

In [24]:
ex = vectorizer.transform([processedCorpus[0]])
ex.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [25]:
svm.predict(ex)

array(['loan_nonbank'], 
      dtype='|S17')

In [26]:
svm.predict(vectorizer.transform(["Hello offer price low"]))

array(['discount'], 
      dtype='|S17')

In [27]:
svm.predict(vectorizer.transform(["bank account"]))

array(['ebola'], 
      dtype='|S17')

In [28]:
svm.predict(vectorizer.transform(["account password"]))

array(['transactional'], 
      dtype='|S17')