In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("../input/train.csv")
df.head()
df_test = pd.read_csv("../input/test.csv")

In [3]:
X = np.array(df['title'])
y = np.array(df['Category'])

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=53)
my_tags = [str(tag) for tag in set(y_test)]

In [5]:
model = Pipeline([('vectorizer', CountVectorizer(min_df=2,max_features=None,analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1,3))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', OneVsRestClassifier(LinearSVC(C=1)))])

In [6]:
#fit model with training data
model.fit(X_train, y_train)
#evaluation on test data
pred = model.predict(X_test)
print('accuracy %s' % accuracy_score(y_test,pred))
print(classification_report(y_test, pred,target_names=my_tags))

accuracy 0.7483851024851921
              precision    recall  f1-score   support

           0       0.65      0.47      0.55      1217
           1       0.79      0.65      0.71      9469
           2       0.85      0.89      0.87      3825
           3       0.87      0.90      0.89     26932
           4       0.72      0.76      0.74     14146
           5       0.77      0.81      0.79     18101
           6       0.42      0.47      0.44       654
           7       0.79      0.75      0.77      3863
           8       0.62      0.66      0.64      1972
           9       0.76      0.55      0.64      2765
          10       0.72      0.70      0.71       358
          11       0.66      0.63      0.64      1336
          12       0.79      0.91      0.85      7243
          13       0.67      0.61      0.64       961
          14       0.43      0.27      0.33       870
          15       0.52      0.20      0.29       198
          16       0.60      0.27      0.37       735

In [7]:
from tqdm import tqdm
infile = open("predictions.csv",'w+')
infile.write('itemid,Category\n')

for i in tqdm(range(len(df_test))):
    a = df_test["title"][i]
    b = model.predict([a])[0]
    infile.write(str(df_test["itemid"][i]))
    infile.write(',')
    infile.write(str(b))
    infile.write('\n')
    
print("done")
infile.close()

100%|██████████| 172402/172402 [11:37<00:00, 247.05it/s]

done



