<a href="https://colab.research.google.com/github/yugant10-commits/docsumo/blob/master/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [53]:
# Read a CSV file with the 'unicode_escape' encoding and drop rows whose 'text' value is duplicated (the last occurrence is kept).

def read_pipeline(file_name):
  """Load a CSV file and keep one row per distinct ``text`` value.

  The file is decoded with the ``unicode_escape`` codec. When several rows
  share the same ``text``, only the last of them is kept. Surviving rows keep
  their original index labels, so the returned index may contain gaps.

  Args:
    file_name: Location of the CSV file, passed straight to ``pd.read_csv``.

  Returns:
    A DataFrame containing the de-duplicated rows.
  """
  raw_frame = pd.read_csv(file_name, encoding='unicode_escape')
  # Flag every occurrence of a repeated text except its final one.
  is_earlier_repeat = raw_frame.duplicated(subset='text', keep='last')
  return raw_frame[~is_earlier_repeat]


In [66]:
# Load the labelled training data; rows with repeated text are removed by read_pipeline.
train_df = read_pipeline(file_name='train_set.csv')
train_df.head(5)

Unnamed: 0,label,text
0,85389000,pdscpm gb part of panel of chiller
1,85389000,nm p economical extended rot hand parts for c...
5,85389000,sleeve joint ip kseya
10,85389000,small controller panel for process controller...
11,85389000,a compact fuseholder nwcmf electrical fuse lin...


In [68]:

# Class distribution of the training labels, most frequent first.
# Series.value_counts is used because the top-level pd.value_counts is
# deprecated in pandas >= 2.1.
train_df['label'].value_counts()

39269099    1624
85177090    1607
85389000    1556
73181500    1274
85369090    1259
85366990    1034
87089900     786
84713010     564
87082900     549
33041000     547
85364900     455
85238090      93
Name: label, dtype: int64

In [55]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

label    0
text     0
dtype: int64

In [38]:
# Classification pipeline: TF-IDF features -> SMOTE oversampling -> multinomial Naive Bayes.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('smote', SMOTE(random_state=42)),
    ('mnb', MultinomialNB()),
]
text_pipeline = Pipeline(steps=pipeline_steps)

In [60]:
# Hold out 20% of the labelled rows for evaluation.
# random_state makes the split (and therefore the reported accuracy) repeatable
# across kernel restarts. stratify keeps each label's share the same in both
# parts, which matters because the label counts are heavily imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    train_df['text'],
    train_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=train_df['label'],
)

In [None]:
# Fit the pipeline on the training split, then label the held-out split.
# `fit` returns the pipeline itself, so `model` and `text_pipeline` are the
# same fitted object.
text_pipeline.fit(x_train, y_train)
model = text_pipeline
y_pred = model.predict(x_test)

In [62]:
# Share of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy on test dataset:', accuracy)

Accuracy on test dataset: 0.8004405286343612


In [65]:
# Load the unlabelled test data; rows with repeated text are removed by read_pipeline.
test_df = read_pipeline(file_name='test_set.csv')
test_df.head(5)

Unnamed: 0,text
3,parts of relayelectrical contact issu e f xxup
4,parts for programmable logic controllers dm ...
6,nmsp ext rotary handle mech parts for circuit ...
8,element sh electrical fuse links actual user
9,uxab ra kit connec xk xsuppaux xprrce parts fo...


In [70]:
# Predict labels for the test texts. NOTE: this rebinds `y_pred`, replacing the
# held-out predictions computed earlier in the notebook.
test_texts = test_df['text']
y_pred = model.predict(test_texts)


array([85389000, 85389000, 85389000, ..., 84713010, 84713010, 84713010])

In [73]:
# Pair each prediction with the text it was made for. `y_pred` follows the row
# order of `test_df`, but `test_df` keeps its original (gapped) index labels
# after de-duplication, whereas a newly built frame gets a fresh 0..n-1 index.
# Assigning the Series directly would align on those labels and attach the wrong
# text (or NaN) to most rows, so the text is passed positionally as a plain array.
test_pred_df = pd.DataFrame({
    'Predicted Label': y_pred,
    'text': test_df['text'].to_numpy(),
})

In [75]:
# Save the predictions for submission; the row index is written as the first column.
test_pred_df.to_csv(path_or_buf='prediction_submission.csv')