In [3]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas as pd
import numpy as np
# import textblob




In [5]:
trainDF = pd.read_csv(r'train_set.csv',header=0,encoding = "ISO-8859-1" )
trainDF.head()

Unnamed: 0,label,text
0,85389000,pdscpm gb part of panel of chiller
1,85389000,nm p economical extended rot hand parts for c...
2,85389000,lv ma pd trip unit for cvs parts of circuit br...
3,85389000,lv na p trip unit for cvs switch parts of circ...
4,85389000,lv tmd pd trip unit for cvs parts of circuitbr...


In [6]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [7]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [8]:

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



In [9]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

In [10]:
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.8306233062330624


In [11]:
classifier=svm.SVC()
classifier.fit(xtrain_tfidf_ngram, train_y)

# predict the labels on validation dataset
predictions = classifier.predict(xvalid_tfidf_ngram)



In [22]:
## Test_set

In [27]:
unique_ele=trainDF.label.unique()
dic={}
for i,l in enumerate(unique_ele):
    dic[i]=l
print(dic)

{0: 85389000, 1: 85177090, 2: 85369090, 3: 39269099, 4: 73181500, 5: 85366990, 6: 85364900, 7: 85238090, 8: 87089900, 9: 33041000, 10: 87082900, 11: 84713010}


In [28]:
test1= pd.read_csv(r'test_set.csv',header=0,encoding = "ISO-8859-1")

In [29]:
tfidf_vect_ngram_test = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)

tfidf_vect_ngram_test.fit(test1['text'])
xtest_tfidf_ngram =  tfidf_vect_ngram_test.transform(test1.text)
predictions = classifier.predict(xtest_tfidf_ngram)

In [35]:

pred=[]
for i in predictions:
    pred.append(dic[i])
# print(pred)


list1=[]
for i,j in zip(pred,test1.text):
    list1.append([i,j])
    
#     pred_data.append(pd.DataFrame([i,j]))
# print(pred_data.head())
# print(list1[:10])

pred_data = pd.DataFrame(list1,columns=['lable','text'])
print(pred_data.head())

      lable                                               text
0  85177090  lv tmd pd trip unit for nh parts of circuit br...
1  85177090  module tm analog outputs analog output expansi...
2  33041000   command group t iii mechanismt p parts forcir...
3  33041000    parts of relayelectrical contact  issu e f xxup
4  85177090  parts for programmable logic controllers  dm  ...


In [36]:
pred_data.to_csv('output.csv',index=False)