In [1]:
from __future__ import division
import pandas as pd 
import numpy as np
import scipy
from scipy.sparse import hstack
import gensim
import boto3
import re
import json
import os
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords
import string
from sklearn import linear_model, datasets
from sklearn.externals import joblib
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import precision_recall_fscore_support
from sklearn.decomposition import TruncatedSVD
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the training catalogue: a headerless TSV whose two columns are named
# 'text' and 'label' right after reading.
df = pd.read_csv('rdc-catalog-train.tsv', delimiter='\t', encoding='utf-8', header=None)
df.columns = ['text', 'label']

# Isolate features and targets as 1-D arrays of strings.
# The previous np.hstack() pass built one tiny array per row and concatenated
# them into a fixed-width unicode array padded to the longest entry, which is
# slow and memory hungry. astype(str) keeps the same "everything becomes a
# string" effect (missing values turn into 'nan') without the padding.
X = df["text"].astype(str).values
y = df["label"].astype(str).values

In [3]:
def train_full(classifier, X, y):
    """Fit ``classifier`` on all of ``X``/``y`` and return it.

    The number of samples and the number of targets are printed first.

    Parameters:
        classifier: object exposing ``fit(X, y)``.
        X: sized collection of training samples.
        y: sized collection of training targets.

    Returns:
        The same ``classifier`` object, after ``fit`` has been called on it.
    """
    for caption, data in (("X:", X), ("y:", y)):
        print(caption)
        print(len(data))
    classifier.fit(X, y)
    return classifier


def train_test(classifier, X, y, test_size=0.25, random_state=33):
    """Fit ``classifier`` on a train split and report hold-out metrics.

    The data is split with ``train_test_split``; the sizes of the four parts
    are printed, the classifier is fitted on the training part, and accuracy
    plus support-weighted precision/recall/F1 on the test part are printed.

    Parameters:
        classifier: object exposing ``fit(X, y)`` and ``predict(X)``.
        X: samples.
        y: targets aligned with ``X``.
        test_size: forwarded to ``train_test_split`` (default 0.25, the value
            that used to be hard-coded).
        random_state: forwarded to ``train_test_split`` (default 33, the value
            that used to be hard-coded).

    Returns:
        The fitted ``classifier``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    for caption, part in (("X_train:", X_train), ("X_test:", X_test),
                          ("y_train:", y_train), ("y_test:", y_test)):
        print(caption)
        print(len(part))

    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(
        y_test, y_pred, pos_label=None, average='weighted')
    # Accuracy is derived from the predictions already computed above.
    # classifier.score(X_test, y_test) would run the whole feature extraction
    # and prediction over the test set a second time.
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Precision: ", weighted_p)
    print("Recall: ", weighted_r)
    print("F1-Score: ", weighted_f1)
    return classifier

In [4]:
def stemming_tokenizer(text):
    """Split ``text`` with ``word_tokenize`` and Porter-stem every token.

    Returns the stems as a list, in token order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, word_tokenize(text)))

def lemmatization_tokenizer(text):
    """Split ``text`` with ``word_tokenize`` and lemmatize every token.

    Each token is passed on its own to ``WordNetLemmatizer.lemmatize`` (no
    part-of-speech argument). Returns the lemmas as a list, in token order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatize(token))
    return lemmas

In [5]:
# ---------------------------------------------------------------------------
# Rule tables shared by the text-cleaning helpers below.
#
# The same sequence of regexes used to be written out three times (once per
# helper). It now lives in one ordered table; every helper walks the table in
# the original order because later rules see the output of earlier ones.
# ---------------------------------------------------------------------------

# Numeric prefix of every measurement rule: a run of digits, dots and slashes
# that starts at a word boundary, optionally followed by one whitespace.
_QUANTITY = r'\b[\d\.\/]+\s?'


def _unit(alternatives):
    """Return the regex source for a quantity followed by one of ``alternatives``."""
    return _QUANTITY + '(' + alternatives + r')\b'


# (regex source, long token used by standardize_metrics,
#  short token used by standardize_char)
_MEASURE_RULES = (
    (_unit(r'v|volt'), 'metricV', 'v'),
    (_unit(r'amp|amps|ampere|amperes'), 'metricA', 'a'),
    (_unit(r'mah|ah|ampere-hour'), 'metricAh', 'ah'),
    (_unit(r'in|inch|inches'), 'metricIn', 'in'),
    (_QUANTITY + r'\"', 'metricIn', 'in'),
    (_unit(r'gb|gig|go|giga|gigabit|gigabyte'), 'metricGb', 'gb'),
    (_unit(r'oz|ounce'), 'metricOz', 'oz'),
    (_unit(r'fl\.? oz\.?|fluids? ounces?'), 'metricFlOz', 'floz'),
    (_unit(r'cwt|quintal'), 'metricCwt', 'cwt'),
    (_unit(r'mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz'), 'metricHz', 'hz'),
    (_unit(r'wh|kwh|watt-hour|kilowatt-hour'), 'metricWh', 'wh'),
    (_unit(r'w|kw|watt|kilowatt'), 'metricW', 'w'),
    (_unit(r'mf|mfd|mmf|mmfd|microfarad'), 'metricMfd', 'mfd'),
    (_unit(r'ft|feet|foot'), 'metricFt', 'ft'),
    (_unit(r'cm|centimeter'), 'metricCm', 'cm'),
    (_unit(r'mm|millimeter'), 'metricMm', 'mm'),
    (_unit(r'km|kilometer'), 'metricKm', 'km'),
    (_unit(r'm|meter'), 'metricM', 'm'),
    (_unit(r'cell|cells'), 'metricCell', 'cell'),
    (_unit(r'lb|lbs|pound'), 'metricLb', 'lb'),
    # The yard rule used to be applied a second time after the gallon rule;
    # that second pass could never match anything and has been dropped.
    (_unit(r'yds|yd|yard|yards'), 'metricYd', 'yd'),
    (_unit(r'pc|pcs|pieces|piece'), 'metricPc', 'pcs'),
    (_unit(r'gal|gals|gallon|gallons'), 'metricGal', 'gal'),
    (_unit(r'deg|degs|degree|degrees'), 'metricDeg', 'deg'),
    (_QUANTITY + r'\°', 'metricDeg', 'deg'),
    (_unit(r'l|liter|liters'), 'metricL', 'l'),
    (_unit(r'ml|mls|milliliter|milliliters'), 'metricMl', 'ml'),
    (_unit(r'kg|kilograms|kilogram'), 'metricKg', 'kg'),
    (_unit(r'g|grams|gram'), 'metricG', 'g'),
    (_unit(r'mg|mgs|milligrams|milligram'), 'metricMg', 'mg'),
    # Short token was misspelled 'sg'; every other short token is the unit's
    # own abbreviation, so it is now 'sq'.
    (_unit(r'sq|sqs|square|squares'), 'metricSq', 'sq'),
    (_unit(r'pt|pts|pint|pints'), 'metricPt', 'pt'),
    (_unit(r'ohm'), 'metricOhm', 'ohm'),
    (_unit(r'fz'), 'metricFz', 'fz'),
    (_unit(r'ct|cts|carat|carats'), 'metricCt', 'ct'),
    (r'\b[\d]+p\b', 'metricRes', 'res'),
    (r'\b[\d]+x[\d]+\b', 'metricRes', 'res'),
    (r'\b[\d]+x\b', 'metricX', 'x'),
)

# "<n> pack" / "pack of <n>" and "<n> set" / "set of <n>" style quantities.
_BUNDLE_RULES = (
    (r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)pa?cks?\b', 'pck'),
    (r'\bpa?cks?(\s|-|\sof\s|)\d+\b', 'pck'),
    (r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)sets?\b', 'set'),
    (r'\bsets?(\s|-|\sof\s|)\d+\b', 'set'),
)


def _compile_steps(pairs):
    """Compile ``(regex source, replacement)`` pairs into an ordered tuple of steps."""
    return tuple((re.compile(source), token) for source, token in pairs)


# Steps of standardize_metrics: measurements -> long 'metricXx' tokens.
_METRIC_STEPS = _compile_steps(
    (source, long_token) for source, long_token, _short in _MEASURE_RULES)

# Number handling that follows standardize_metrics in the v0plus cleaner.
_V0PLUS_NUMBER_STEPS = _compile_steps((
    (r'\b\d*\.\d+\b', 'nbDec'),
    (r'\b\d+\/\d+\b', 'nbFra'),
    (r'\b\d+\b', 'nbNat'),
    (r'\d+', '0'),
))

# Steps of standardize_char: bundles, measurements (short tokens), plain
# numbers, mixed letter/digit words, then any digits that are still left.
_CHAR_STEPS = _compile_steps(
    _BUNDLE_RULES
    + tuple((source, short_token) for source, _long, short_token in _MEASURE_RULES)
    + (
        (r'\b\d*\.\d+\b', 'deci'),
        (r'\b\d+\/\d+\b', 'frac'),
        (r'\b\d+\b', 'nat'),
        (r'\b(\w*\d\w*[a-z]+\w*|\w*[a-z]+\w*\d\w*)\b', 'sku'),
        (r'\d+', '0'),
    )
)

# xtrem_clean_char walks the very same patterns but deletes every match.
_XTREM_STEPS = tuple((regex, '') for regex, _token in _CHAR_STEPS)

_NON_WORD = re.compile(r'\W')
_WHITESPACE = re.compile(r'\s+')


def _apply_steps(text, steps):
    """Run every ``(compiled regex, replacement)`` step over ``text`` in order."""
    for regex, replacement in steps:
        text = regex.sub(replacement, text)
    return text


def _collapse_non_word(text):
    """Turn non-word characters into spaces, squeeze whitespace runs, strip spaces."""
    text = _NON_WORD.sub(' ', text)
    text = _WHITESPACE.sub(' ', text)
    return text.strip(' ')


def standardize_metrics(text):
    """Replace "<number><unit>" expressions by a per-unit 'metricXx' token.

    The patterns are lower-case and ``text`` is not lower-cased here, so the
    caller is expected to pass lower-case text (as
    ``clean_text_standard_metrics_v0plus`` does). Text that matches no rule is
    returned unchanged.
    """
    return _apply_steps(text, _METRIC_STEPS)


def clean_text_standard_metrics_v0plus(text):
    """Lower-case ``text``, tokenise measurements and numbers, drop punctuation.

    Steps: lower-case; ``standardize_metrics``; decimals -> 'nbDec',
    fractions -> 'nbFra', integers -> 'nbNat', remaining digit runs -> '0';
    non-word characters -> space; whitespace squeezed and outer spaces removed.
    """
    text = standardize_metrics(text.lower())
    text = _apply_steps(text, _V0PLUS_NUMBER_STEPS)
    return _collapse_non_word(text)


def standardize_char(text):
    """Lower-case ``text`` and replace quantities by short lower-case tokens.

    Pack/set counts become 'pck'/'set', measurements become the unit
    abbreviation ('v', 'in', 'sq', ...), plain numbers become
    'deci'/'frac'/'nat', words mixing letters and digits become 'sku', and
    leftover digit runs become '0'. Non-word characters are then replaced by
    spaces and whitespace is squeezed.
    """
    return _collapse_non_word(_apply_steps(text.lower(), _CHAR_STEPS))


def xtrem_clean_char(text):
    """Lower-case ``text`` and delete everything ``standardize_char`` would tokenise.

    Every quantity, number, and mixed letter/digit word is removed instead of
    replaced. Non-word characters are then replaced by spaces and whitespace
    is squeezed.
    """
    return _collapse_non_word(_apply_steps(text.lower(), _XTREM_STEPS))



In [6]:
# TEST 26: lemmatization + clean_text_standard_metrics_v0plus preprocessor
#          + sublinear tf + single-letter stop words + min_df 2
# Single letters 'a'..'y' are treated as stop words.
# NOTE(review): 'z' is not in the list - confirm this is intentional.
single_letter = list('abcdefghijklmnopqrstuvwxy')
word_vectorizer = TfidfVectorizer(
    preprocessor=clean_text_standard_metrics_v0plus,
    tokenizer=lemmatization_tokenizer,
    stop_words=single_letter + list(string.punctuation),
    strip_accents='unicode',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    sublinear_tf=True,
    norm='l2',
)
clf = Pipeline([
    ('vectorizer', word_vectorizer),
    ('classifier', LinearSVC(verbose=1)),
])

print("start train...")
clf = train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy: 0.817935
Precision: 0.8095020763184998
Recall: 0.817935
F1-Score: 0.8100457644640601
finished train


In [None]:
# TEST 28b: character n-gram TF-IDF (1- to 4-grams, lower-cased) + LinearSVC
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 4),
    lowercase=True,
    strip_accents='unicode',
    min_df=2,
    sublinear_tf=True,
    norm='l2',
)
clf = Pipeline([
    ('vectorizer', char_vectorizer),
    ('classifier', LinearSVC(verbose=1)),
])

print("start train...")
clf = train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]Accuracy: 0.820155
Precision: 0.8108338638615838
Recall: 0.820155
F1-Score: 0.8108139486508633
finished train


In [6]:
# Module-level hold-out split using the same 25% / seed 33 settings that
# train_test() applies internally.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [7]:
# TEST 50: no lower case
# Case-preserving character n-grams and the cleaned word n-grams are
# concatenated by a FeatureUnion; each view gets half of the feature budget.
# NOTE(review): 'z' is not among the single-letter stop words - confirm.
single_letter = list('abcdefghijklmnopqrstuvwxy')
max_features = 700000
features_per_view = int(max_features/2)

char_view = Pipeline([
    ('char_tf_idf', TfidfVectorizer(
        analyzer='char',
        ngram_range=(1, 4),
        lowercase=False,
        strip_accents='unicode',
        min_df=2,
        sublinear_tf=True,
        norm='l2',
        max_features=features_per_view,
    )),
])
word_view = Pipeline([
    ('words_tf_idf', TfidfVectorizer(
        preprocessor=clean_text_standard_metrics_v0plus,
        tokenizer=lemmatization_tokenizer,
        stop_words=single_letter + list(string.punctuation),
        strip_accents='unicode',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True,
        norm='l2',
        max_features=features_per_view,
    )),
])
clf = Pipeline([
    ('features', FeatureUnion([
        ('characters', char_view),
        ('words', word_view),
    ])),
    ('classifier', LinearSVC(verbose=1)),
])

print("start train...")
clf = train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy:  0.833645
Precision:  0.8272737278177588
Recall:  0.833645
F1-Score:  0.8272507776722227
finished train


In [8]:
def get_text_length(x):
    """Return the length of every item of ``x`` as a column vector.

    The result is a NumPy array reshaped to ``(-1, 1)``, one row per item.
    """
    lengths = np.array(list(map(len, x)))
    return lengths.reshape(-1, 1)

In [9]:
def get_nb_spaces(x):
    """Count the whitespace runs in every text of ``x``.

    A run is one or more consecutive whitespace characters, so "a  b" counts
    as one. The result is a NumPy array reshaped to ``(-1, 1)``, one row per
    text.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence that recent Python versions warn about.
    whitespace_run = re.compile(r"(\s+)")
    return np.array([len(whitespace_run.findall(t)) for t in x]).reshape(-1, 1)

In [10]:
def get_nb_numbers(x):
    """Count the digit runs in every text of ``x``.

    A run is one or more consecutive digits, so "a1b22" counts as two. The
    result is a NumPy array reshaped to ``(-1, 1)``, one row per text.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence that recent Python versions warn about.
    digit_run = re.compile(r"(\d+)")
    return np.array([len(digit_run.findall(t)) for t in x]).reshape(-1, 1)

In [11]:
def get_nb_special_chars(x):
    """Count the runs of non-word characters in every text of ``x``.

    A run is one or more consecutive characters matching the regex class
    ``\\W``; whitespace belongs to that class, so spaces are counted as well
    as punctuation. The result is a NumPy array reshaped to ``(-1, 1)``, one
    row per text.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape
    # sequence that recent Python versions warn about.
    non_word_run = re.compile(r"(\W+)")
    return np.array([len(non_word_run.findall(t)) for t in x]).reshape(-1, 1)

In [13]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# TEST 51: no lower case  + additional features
# Same two TF-IDF views as TEST 50, plus four single-column count features
# (text length, whitespace runs, digit runs, non-word runs) appended by the
# FeatureUnion.
# NOTE(review): 'z' is not among the single-letter stop words - confirm.
single_letter = list('abcdefghijklmnopqrstuvwxy')
max_features = 700000
features_per_view = int(max_features/2)

char_view = Pipeline([
    ('char_tf_idf', TfidfVectorizer(
        analyzer='char',
        ngram_range=(1, 4),
        lowercase=False,
        strip_accents='unicode',
        min_df=2,
        sublinear_tf=True,
        norm='l2',
        max_features=features_per_view,
    )),
])
word_view = Pipeline([
    ('words_tf_idf', TfidfVectorizer(
        preprocessor=clean_text_standard_metrics_v0plus,
        tokenizer=lemmatization_tokenizer,
        stop_words=single_letter + list(string.punctuation),
        strip_accents='unicode',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True,
        norm='l2',
        max_features=features_per_view,
    )),
])
length_view = Pipeline([
    ('count', FunctionTransformer(get_text_length, validate=False)),
])
spaces_view = Pipeline([
    ('count', FunctionTransformer(get_nb_spaces, validate=False)),
])
numbers_view = Pipeline([
    ('count', FunctionTransformer(get_nb_numbers, validate=False)),
])
special_chars_view = Pipeline([
    ('count', FunctionTransformer(get_nb_special_chars, validate=False)),
])

clf = Pipeline([
    ('features', FeatureUnion([
        ('characters', char_view),
        ('words', word_view),
        ('length', length_view),
        ('spaces', spaces_view),
        ('numbers', numbers_view),
        ('special_chars', special_chars_view),
    ])),
    ('classifier', LinearSVC(verbose=1)),
])

print("start train...")
clf = train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

In [6]:
# TEST 50: no lower case
# Final model: the char + word FeatureUnion is fitted on the complete
# training set through train_full() (no hold-out split here).
# NOTE(review): 'z' is not among the single-letter stop words - confirm.
single_letter = list('abcdefghijklmnopqrstuvwxy')
max_features = 700000
features_per_view = int(max_features/2)

char_view = Pipeline([
    ('char_tf_idf', TfidfVectorizer(
        analyzer='char',
        ngram_range=(1, 4),
        lowercase=False,
        strip_accents='unicode',
        min_df=2,
        sublinear_tf=True,
        norm='l2',
        max_features=features_per_view,
    )),
])
word_view = Pipeline([
    ('words_tf_idf', TfidfVectorizer(
        preprocessor=clean_text_standard_metrics_v0plus,
        tokenizer=lemmatization_tokenizer,
        stop_words=single_letter + list(string.punctuation),
        strip_accents='unicode',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True,
        norm='l2',
        max_features=features_per_view,
    )),
])
clf = Pipeline([
    ('features', FeatureUnion([
        ('characters', char_view),
        ('words', word_view),
    ])),
    ('classifier', LinearSVC(verbose=1)),
])

print("start train...")
clf = train_full(clf, X, y)
print("finished train")

start train...
X:
800000
y:
800000
[LibLinear]finished train


In [7]:
# Persist the fitted pipeline; the call stays the last expression so the
# list of written files is displayed as the cell result.
model_path = '/home/rakutenModelFinal.pkl'
joblib.dump(clf, model_path)

['/home/rakutenModelFinal.pkl']

In [8]:
# Reload the persisted pipeline. joblib files are pickles: only load files
# produced by a trusted run of this notebook.
model_path = '/home/rakutenModelFinal.pkl'
clf = joblib.load(model_path)

In [9]:
# Read the test catalogue (this TSV is read with its header row) and show
# its (rows, columns) shape as the cell result.
df = pd.read_csv(
    './rdc-catalog-test.tsv',
    delimiter='\t',
    encoding='utf-8',
)
df.shape

(200000, 2)

In [21]:
# Remove the category column so only the text to classify is left, then
# display the remaining values.
df = df.drop('CategoryIdPath', axis=1)
df.values

array([['Sterling Silver Dangle Ball Earrings w/ Brilliant Cut CZ Stones & Yellow Topaz-colored Crystal Balls, 1" (26 mm) tall'],
       ['ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112 120 M2 BY 110-555PRM 110-555P 110-910 110-555PRM'],
       ['Disc Brake Rotor-Advanced Technology Rear Raybestos 980368'],
       ...,
       ['8-Pack Replacement Engine Air Filter for 2003 Ford Mustang V8 4.6 Car/Automotive'],
       ['WALTER F4253.B32.100.Z05.12 Indexable Mill Cutter, F4253B32100Z0512'],
       ['Skin for Microsoft Surface Pro (2017) 12.3" - Deer Hunter| MightySkins Protective, Durable, and Unique Vinyl Decal wrap cover  | Easy To Apply, Remove, and Change Styles | Made in the USA']],
      dtype=object)

In [22]:
# Flatten the remaining frame values into a 1-D array (row-major order) and
# classify every entry with the loaded pipeline.
values = df.values.ravel()
predictions = clf.predict(values)

In [23]:
# Sanity check: shape of the prediction array.
np.shape(predictions)

(200000,)

In [24]:
# Put the predicted category paths back as column 'CategoryIdPath' at
# position 1 and preview the result.
df.insert(loc=1, column='CategoryIdPath', value=list(predictions))
df.head()

Unnamed: 0,Title,CategoryIdPath
0,Sterling Silver Dangle Ball Earrings w/ Brilli...,1608>2320>2173>2878
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...,2199>4592>12
2,Disc Brake Rotor-Advanced Technology Rear Rayb...,2199>4592>12
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...,1608>4269>3031>1221
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...,3292>114>1231


In [25]:
# Write the submission as a headerless, index-free TSV, then read it back
# to preview what was actually written.
df.to_csv(
    "submission_test_final.tsv",
    sep='\t',
    encoding='utf-8',
    index=False,
    header=False,
)
sub_df = pd.read_csv(
    "submission_test_final.tsv",
    delimiter='\t',
    encoding='utf-8',
    header=None,
)
sub_df.head()

Unnamed: 0,0,1
0,Sterling Silver Dangle Ball Earrings w/ Brilli...,1608>2320>2173>2878
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...,2199>4592>12
2,Disc Brake Rotor-Advanced Technology Rear Rayb...,2199>4592>12
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...,1608>4269>3031>1221
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...,3292>114>1231
