In [1]:
import lxml
import json
import copy
import string
import warnings
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.svm import SVC
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn import decomposition
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
# !pip install bs4

In [3]:
Labels = ['country', 'id', 'title', 'c1', 'c2', 'c3', 'description', 'price', 'type']

In [5]:
df = pd.read_csv('data_train.csv',names = Labels)

In [6]:
def extract_fet_class(df):
    
    df.drop(['country','id','price','type'],inplace=True,axis=1)
    df['c3'] = df['c3'].astype(str) #removing all the non string
    Y1 = df['c1']
    Y2 = df['c2']
    Y3 = df['c3']
    X = df.drop(['c1','c2','c3'],axis=1)
    X = X['title'] +' '+ X['description']
    
    return X, Y1,Y2,Y3

In [7]:
def replace_all(text, sym=['<ul>', '</ul>', '<li>', '</li>']):
    
        for i in sym:
            text = text.replace(i, ' ')

        textSec = []
        for i in text.split(' '):
            textTh = []
            for j in i:
                textTh.append(j if ord(j) < 128 else ' ')
            textTh = ''.join(textTh).strip().split(' ')

            for t in textTh:
                if t != '':
                    textSec.append(t)

        return ' '.join(textSec)

In [8]:
def remove_puncs(text):
    
        punctuations = '''!()–+=-[]{};:'"\,<>./?@#$%^&*_~'''
        no_punct = ""
        for char in text:
            if char not in punctuations:
                no_punct += char

        return no_punct

In [9]:
def PreProcess(df, cols=['title','description']):
    
        for col in cols:
            df[col].fillna('', inplace=True)
            df[col] = df[col].apply(lambda x: replace_all(x))
            if col == 'description':
                df[col] = df[col].apply(lambda x: BeautifulSoup(x, "html.parser").text)
            df[col] = df[col].apply(lambda x: remove_puncs(x))

In [10]:
def cleaning_data(df, tfidf_vectorizer, tfidf_vectorizer_train=None):
    
    PreProcess(df)
    X, Y1, Y2, Y3 = extract_fet_class(df)
    X_tfidf = tfidf_vectorizer.fit_transform(X)
    
    if tfidf_vectorizer_train:
    
        corpus_vocabulary = defaultdict(None, copy.deepcopy(tfidf_vectorizer_train.vocabulary_))
        corpus_vocabulary.default_factory = corpus_vocabulary.__len__
    
        for word in tfidf_vectorizer.vocabulary_.keys():
            if word in tfidf_vectorizer_train.vocabulary_:
                corpus_vocabulary[word]
                
        tfidf_vectorizer = TfidfVectorizer(vocabulary=corpus_vocabulary)
        X_tfidf = tfidf_vectorizer.fit_transform(X)
    
    return X_tfidf, Y1, Y2, Y3

In [11]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, Y1, Y2, Y3 = cleaning_data(df, tfidf_vectorizer)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X_train_tfidf, Y3, test_size=0.3, random_state=42)

In [13]:
param_grid = {'penalty': ['l1', 'l2'], 'multi_class':['ovr', 'crammer_singer']}

In [None]:
grid = GridSearchCV(svm.LinearSVC(),param_grid,refit=True,verbose=2,n_jobs=4)
grid.fit(x_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
print(grid.best_estimator_)