In [1]:
import pandas as pd
import numpy as np
from numpy import zeros

In [2]:
#sklearn imports
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn import cross_validation
from sklearn.metrics import classification_report

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score



In [3]:
#load the training set (tab-separated file)
data = pd.read_csv('train_set.csv', delimiter="\t")

In [4]:
#encode the category names as integer class ids
# `le` stays fitted so the ids can be mapped back to names later on.
le = preprocessing.LabelEncoder()
y = le.fit_transform(data["Category"])
#show the distinct class ids
set(y)

{0, 1, 2, 3, 4}

In [5]:
#import pipeline
from sklearn.pipeline import make_pipeline

In [6]:
#import vectorizer and lsi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [7]:
#do vectorization
# Feature pipeline: TF-IDF -> LSA (truncated SVD, 100 components) -> row normalisation.
# NOTE: the vectorizer and the SVD are fitted on the whole corpus here, i.e. before
# any cross-validation split is made further down.
vectorizer = TfidfVectorizer(max_df=0.5, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(data['Content'])
# Fix the seed: the default SVD solver is randomized, so without it the features
# (and every metric computed from them) change from run to run.
lsa = TruncatedSVD(n_components=100, random_state=42)
X = lsa.fit_transform(X)
# copy=False normalises the dense LSA matrix in place instead of allocating a copy.
X = preprocessing.Normalizer(copy=False).fit_transform(X)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [9]:
# Hyper-parameter grid for the SVC search: one sub-grid per kernel.
# Both kernels are tried with the same set of C values; only rbf has a gamma.
svm_costs = [1, 10, 100, 1000]
tuned_parameters = [
    {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': list(svm_costs)},
    {'kernel': ['linear'], 'C': list(svm_costs)},
]

In [10]:
# Variant that standardises the features before running the 10-fold grid search.
# NOTE: when the cells run top to bottom, `clf` is rebound by the next cell
# before anything is fitted, so this pipeline is never used.
clf = make_pipeline(
    preprocessing.StandardScaler(),
    GridSearchCV(estimator=SVC(), param_grid=tuned_parameters, n_jobs=-1, cv=10),
)

In [11]:
# 10-fold grid search over the SVC parameter grid, using all available cores.
clf = GridSearchCV(
    estimator=SVC(),
    param_grid=tuned_parameters,
    n_jobs=-1,
    cv=10,
)

In [12]:
# Run the grid search on the LSA features and the encoded labels.
clf.fit(X=X, y=y)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'kernel': ['rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}, {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
# Show the parameter combination selected by the grid search.
clf.best_params_


{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}

In [38]:
# Final classifier: standardise the features, then an SVC built from the
# parameters the grid search selected.
# NOTE(review): the search above tuned SVC() on unscaled X, while this model puts a
# StandardScaler in front of it -- confirm the selected parameters still apply.
best_svc = SVC(**clf.best_params_)
model = make_pipeline(preprocessing.StandardScaler(), best_svc)

In [39]:
#out-of-fold predictions from 10-fold cross-validation
predicted = cross_val_predict(estimator=model, X=X, y=y, cv=10)

In [40]:
# Display the predicted class ids (still integer-encoded at this point).
predicted

array([0, 0, 0, ..., 2, 0, 4])

In [41]:
#import for ignore the famous error
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [42]:
#map the predicted class ids back to their category names
predicted_categories = le.inverse_transform(y=predicted)
predicted_categories

array(['Business', 'Business', 'Business', ..., 'Football', 'Business',
       'Technology'], dtype=object)

In [43]:
#import for metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [44]:
#macro-averaged precision, recall and F-score of the cross-validated predictions
# `score` is the full result tuple; its first three entries are unpacked below.
score = precision_recall_fscore_support(y_true=y, y_pred=predicted, average='macro')
precision, recall, f1_score = score[:3]

In [45]:
#accuracy of the cross-validated predictions
accuracy = accuracy_score(y_true=y, y_pred=predicted)

In [46]:
#load the shared results table (tab-separated) that the metrics are written into
# NOTE: this rebinds `data`; the training frame loaded at the top is no longer
# reachable under that name after this cell.
data = pd.read_csv('EvaluationMetric_10fold.csv', delimiter='\t')

In [47]:
#write results to csv
# Fill the 'SVM' column of the results table. Rows are addressed by their integer
# index labels with .loc; the .ix indexer used previously has been removed from
# pandas, and on this integer index it was label-based too, so .loc is equivalent.
data.loc[0, 'SVM'] = accuracy    # row 0: Accuracy
data.loc[1, 'SVM'] = precision   # row 1: Precision
data.loc[2, 'SVM'] = recall      # row 2: Recall
data.loc[3, 'SVM'] = f1_score    # row 3: F-Measure
#save/overwrite csv
data.to_csv('EvaluationMetric_10fold.csv', sep='\t', index=False)

In [48]:
# Display the results table with the SVM column filled in.
data

Unnamed: 0,Statistic Measure,Naive Bayes,Random Forest,SVM,KNN,My Method
0,Accuracy,0.939426,0.678787,0.959237,,
1,Precision,0.935177,0.648057,0.956933,,
2,Recall,0.937824,0.64748,0.957114,,
3,F-Measure,0.936162,0.647525,0.957014,,
