In [1]:
from __future__ import division
import pandas as pd 
import numpy as np
import scipy
from scipy.sparse import hstack
import gensim
import boto3
import re
import json
import os
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords
import string
from sklearn import linear_model, datasets
from sklearn.externals import joblib
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import precision_recall_fscore_support
from sklearn.decomposition import TruncatedSVD
from sklearn.calibration import CalibratedClassifierCV
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df=pd.read_csv('rdc-catalog-train.tsv',delimiter='\t',encoding='utf-8',header=None)
df.columns = ['text', 'label']

#Isolate target data
X = df["text"].values
X = np.hstack(X)
y = df["label"].values
y = np.hstack(y)

In [3]:
def train_full(classifier, X, y):
    print("X:")
    print(len(X))
    print("y:")
    print(len(y))
    classifier.fit(X, y)
    return classifier


def train_test(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    print("X_train:")
    print(len(X_train))
    print("X_test:")
    print(len(X_test))
    print("y_train:")
    print(len(y_train))
    print("y_test:")
    print(len(y_test))
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
    print("Accuracy: ", classifier.score(X_test, y_test))
    print("Precision: ", weighted_p)
    print("Recall: ", weighted_r)
    print("F1-Score: ", weighted_f1)
    return classifier

In [4]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

def lemmatization_tokenizer(text):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w) for w in word_tokenize(text)]

In [5]:
def standardize_metrics(text):
    text = re.sub(r'\b[\d\.\/]+\s?(v|volt)\b', 'metricV', text)
    text = re.sub(r'\b[\d\.\/]+\s?(amp|amps|ampere|amperes)\b', 'metricA', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mah|ah|ampere-hour)\b', 'metricAh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(in|inch|inches)\b', 'metricIn', text)
    text = re.sub(r'\b[\d\.\/]+\s?\"', 'metricIn', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gb|gig|go|giga|gigabit|gigabyte)\b', 'metricGb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(oz|ounce)\b', 'metricOz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fl\.? oz\.?|fluids? ounces?)\b', 'metricFlOz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cwt|quintal)\b', 'metricCwt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz)\b', 'metricHz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(wh|kwh|watt-hour|kilowatt-hour)\b', 'metricWh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(w|kw|watt|kilowatt)\b', 'metricW', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mf|mfd|mmf|mmfd|microfarad)\b', 'metricMfd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ft|feet|foot)\b', 'metricFt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cm|centimeter)\b', 'metricCm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mm|millimeter)\b', 'metricMm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(km|kilometer)\b', 'metricKm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(m|meter)\b', 'metricM', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cell|cells)\b', 'metricCell', text)
    text = re.sub(r'\b[\d\.\/]+\s?(lb|lbs|pound)\b', 'metricLb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yds|yd|yard|yards)\b', 'metricYd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pc|pcs|pieces|piece)\b', 'metricPc', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gal|gals|gallon|gallons)\b', 'metricGal', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yd|yds|yard|yards)\b', 'metricYd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(deg|degs|degree|degrees)\b', 'metricDeg', text)
    text = re.sub(r'\b[\d\.\/]+\s?\°', 'metricDeg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(l|liter|liters)\b', 'metricL', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ml|mls|milliliter|milliliters)\b', 'metricMl', text)
    text = re.sub(r'\b[\d\.\/]+\s?(kg|kilograms|kilogram)\b', 'metricKg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(g|grams|gram)\b', 'metricG', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mg|mgs|milligrams|milligram)\b', 'metricMg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(sq|sqs|square|squares)\b', 'metricSq', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pt|pts|pint|pints)\b', 'metricPt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ohm)\b', 'metricOhm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fz)\b', 'metricFz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ct|cts|carat|carats)\b', 'metricCt', text)
    text = re.sub(r'\b[\d]+p\b', 'metricRes', text)
    text = re.sub(r'\b[\d]+x[\d]+\b', 'metricRes', text)
    text = re.sub(r'\b[\d]+x\b', 'metricX', text)
    return text

def clean_text_standard_metrics_v0plus(text):
    text = text.lower()
    text = standardize_metrics(text)
    text = re.sub(r'\b\d*\.\d+\b', 'nbDec', text)
    text = re.sub(r'\b\d+\/\d+\b', 'nbFra', text)
    text = re.sub(r'\b\d+\b', 'nbNat', text)
    text = re.sub('\d+', '0', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def standardize_char(text):
    text = text.lower()
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)pa?cks?\b', 'pck', text)
    text = re.sub(r'\bpa?cks?(\s|-|\sof\s|)\d+\b', 'pck', text)
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)sets?\b', 'set', text)
    text = re.sub(r'\bsets?(\s|-|\sof\s|)\d+\b', 'set', text)
    text = re.sub(r'\b[\d\.\/]+\s?(v|volt)\b', 'v', text)
    text = re.sub(r'\b[\d\.\/]+\s?(amp|amps|ampere|amperes)\b', 'a', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mah|ah|ampere-hour)\b', 'ah', text)
    text = re.sub(r'\b[\d\.\/]+\s?(in|inch|inches)\b', 'in', text)
    text = re.sub(r'\b[\d\.\/]+\s?\"', 'in', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gb|gig|go|giga|gigabit|gigabyte)\b', 'gb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(oz|ounce)\b', 'oz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fl\.? oz\.?|fluids? ounces?)\b', 'floz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cwt|quintal)\b', 'cwt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz)\b', 'hz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(wh|kwh|watt-hour|kilowatt-hour)\b', 'wh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(w|kw|watt|kilowatt)\b', 'w', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mf|mfd|mmf|mmfd|microfarad)\b', 'mfd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ft|feet|foot)\b', 'ft', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cm|centimeter)\b', 'cm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mm|millimeter)\b', 'mm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(km|kilometer)\b', 'km', text)
    text = re.sub(r'\b[\d\.\/]+\s?(m|meter)\b', 'm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cell|cells)\b', 'cell', text)
    text = re.sub(r'\b[\d\.\/]+\s?(lb|lbs|pound)\b', 'lb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yds|yd|yard|yards)\b', 'yd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pc|pcs|pieces|piece)\b', 'pcs', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gal|gals|gallon|gallons)\b', 'gal', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yd|yds|yard|yards)\b', 'yd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(deg|degs|degree|degrees)\b', 'deg', text)
    text = re.sub(r'\b[\d\.\/]+\s?\°', 'deg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(l|liter|liters)\b', 'l', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ml|mls|milliliter|milliliters)\b', 'ml', text)
    text = re.sub(r'\b[\d\.\/]+\s?(kg|kilograms|kilogram)\b', 'kg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(g|grams|gram)\b', 'g', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mg|mgs|milligrams|milligram)\b', 'mg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(sq|sqs|square|squares)\b', 'sg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pt|pts|pint|pints)\b', 'pt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ohm)\b', 'ohm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fz)\b', 'fz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ct|cts|carat|carats)\b', 'ct', text)
    text = re.sub(r'\b[\d]+p\b', 'res', text)
    text = re.sub(r'\b[\d]+x[\d]+\b', 'res', text)
    text = re.sub(r'\b[\d]+x\b', 'x', text)
    text = re.sub(r'\b\d*\.\d+\b', 'deci', text)
    text = re.sub(r'\b\d+\/\d+\b', 'frac', text)
    text = re.sub(r'\b\d+\b', 'nat', text)
    text = re.sub(r'\b(\w*\d\w*[a-z]+\w*|\w*[a-z]+\w*\d\w*)\b', 'sku', text)
    text = re.sub('\d+', '0', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def xtrem_clean_char(text):
    text = text.lower()
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)pa?cks?\b', '', text)
    text = re.sub(r'\bpa?cks?(\s|-|\sof\s|)\d+\b', '', text)
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)sets?\b', '', text)
    text = re.sub(r'\bsets?(\s|-|\sof\s|)\d+\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(v|volt)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(amp|amps|ampere|amperes)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mah|ah|ampere-hour)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(in|inch|inches)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?\"', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gb|gig|go|giga|gigabit|gigabyte)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(oz|ounce)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fl\.? oz\.?|fluids? ounces?)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cwt|quintal)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(wh|kwh|watt-hour|kilowatt-hour)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(w|kw|watt|kilowatt)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mf|mfd|mmf|mmfd|microfarad)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ft|feet|foot)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cm|centimeter)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mm|millimeter)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(km|kilometer)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(m|meter)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cell|cells)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(lb|lbs|pound)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yds|yd|yard|yards)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pc|pcs|pieces|piece)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gal|gals|gallon|gallons)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yd|yds|yard|yards)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(deg|degs|degree|degrees)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?\°', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(l|liter|liters)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ml|mls|milliliter|milliliters)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(kg|kilograms|kilogram)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(g|grams|gram)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mg|mgs|milligrams|milligram)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(sq|sqs|square|squares)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pt|pts|pint|pints)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ohm)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fz)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ct|cts|carat|carats)\b', '', text)
    text = re.sub(r'\b[\d]+p\b', '', text)
    text = re.sub(r'\b[\d]+x[\d]+\b', '', text)
    text = re.sub(r'\b[\d]+x\b', '', text)
    text = re.sub(r'\b\d*\.\d+\b', '', text)
    text = re.sub(r'\b\d+\/\d+\b', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'\b(\w*\d\w*[a-z]+\w*|\w*[a-z]+\w*\d\w*)\b', '', text)
    text = re.sub('\d+', '', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text



In [6]:
#TEST 26: lemmatization + clean_text_standard_metrics_v0plus preprocessor + sublinear + single letter + min_df 2
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,
                                stop_words=single_letter+list(string.punctuation))),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy: 0.817935
Precision: 0.8095020763184998
Recall: 0.817935
F1-Score: 0.8100457644640601
finished train


In [None]:
#TEST 28b: char n gram
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode')),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]Accuracy: 0.820155
Precision: 0.8108338638615838
Recall: 0.820155
F1-Score: 0.8108139486508633
finished train


In [6]:
#TEST 40: char n gram (limited features to 250k)
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode',max_features=250000)),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy:  0.819765
Precision:  0.810456071770931
Recall:  0.819765
F1-Score:  0.810496372763874
finished train


In [6]:
#TEST 41: word/char limit to 600k features
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
max_features = 600000
clf = Pipeline([
  ('features', FeatureUnion([
    ('characters', Pipeline([
      ('char_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode',max_features=int(max_features/2)))
    ])),
    ('words', Pipeline([
      ('words_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation),max_features=int(max_features/2)))
    ])),
  ])),
  ('classifier', LinearSVC(verbose=1))
])


print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy:  0.828935
Precision:  0.8224192086402817
Recall:  0.828935
F1-Score:  0.822521044388522
finished train


In [6]:
#TEST 42: word/char limit to 700k features
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
max_features = 700000
clf = Pipeline([
  ('features', FeatureUnion([
    ('characters', Pipeline([
      ('char_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode',max_features=int(max_features/2)))
    ])),
    ('words', Pipeline([
      ('words_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation),max_features=int(max_features/2)))
    ])),
  ])),
  ('classifier', LinearSVC(verbose=1))
])


print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy:  0.829165
Precision:  0.8226888384508295
Recall:  0.829165
F1-Score:  0.822767246106143
finished train


In [7]:
joblib.dump(clf, '/home/rakutenWordCharTest.pkl')

['/home/rakutenWordCharTest.pkl']

In [6]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
max_features = 700000
clf = Pipeline([
  ('features', FeatureUnion([
    ('characters', Pipeline([
      ('char_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode',max_features=int(max_features/2)))
    ])),
    ('words', Pipeline([
      ('words_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation),max_features=int(max_features/2)))
    ])),
  ])),
  ('classifier', LinearSVC(verbose=1))
])


print("start train...")
clf=train_full(clf, X, y)
print("finished train")

start train...
X:
800000
y:
800000
[LibLinear]finished train


In [7]:
joblib.dump(clf, '/home/rakutenWordCharFull.pkl')

['/home/rakutenWordCharFull.pkl']

In [8]:
clf = joblib.load('/home/rakutenWordCharFull.pkl') 

In [8]:
df=pd.read_csv('./rdc-catalog-test.tsv',delimiter='\t',encoding='utf-8')
df.head()

Unnamed: 0,Title,CategoryIdPath
0,Sterling Silver Dangle Ball Earrings w/ Brilli...,1608>2320>2173>3813
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...,2199>4592>12
2,Disc Brake Rotor-Advanced Technology Rear Rayb...,2199>4592>12
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...,1608>4269>3031>1221
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...,3292>114>1231


In [9]:
df.values.shape

(200000, 2)

In [23]:
predictions.shape

(200000,)

In [11]:
df = df.drop(columns=['CategoryIdPath'])
df.insert(1,'CategoryIdPath',[pred for pred in predictions])
df.head()

NameError: name 'predictions' is not defined

In [29]:
df.to_csv("submission_test_combined.tsv", sep='\t', encoding='utf-8',index=False,header=False)
sub_df=pd.read_csv("submission_test_combined.tsv",delimiter='\t',encoding='utf-8',header=None)
sub_df.head()

Unnamed: 0,0,1
0,Sterling Silver Dangle Ball Earrings w/ Brilli...,1608>2320>2173>2878
1,ALTERNATOR FREIGHTLINER FL FLC 112 120 FLD 112...,2199>4592>12
2,Disc Brake Rotor-Advanced Technology Rear Rayb...,2199>4592>12
3,Coquette Neon Pink Ruffle Babydoll 7035 Neon P...,1608>4269>3031>1221
4,12V 7Ah (SPS Brand) APC NS3000RMT3U Replacemen...,3292>114>1231


In [12]:
df.shape

(200000, 1)

In [6]:
import collections
c = collections.Counter(y)

In [7]:
df=pd.read_csv('rdc-catalog-train.tsv',delimiter='\t',encoding='utf-8',header=None)
df.columns = ['text', 'label']

In [8]:
y_single = c.most_common()[:-20:-1]

In [9]:
df_filtered = df
for (y_to_delete,_) in y_single:
    df_filtered = df_filtered.ix[~(df['label'] == y_to_delete)]
    print(y_to_delete)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


2075>3671>4896>3337
4015>2824>2964>2473>4559>1399>2913
3292>2790>1053>2721
2199>1952>3163>1822
3730>2720>4005>2321
4238>2371>1833>113
4238>321>777>4022
3625>3608>971
3730>1439>4712>3507
4564>802>3059>2754
1395>2736>4447>2048>3707
3730>1874>2009>2992
4015>2028>3803>2617
2199>2819>4536
1395>2736>3899>2657>4591
2075>1724>3258>1372>3689
3292>2375>1365>1640
2075>3407>2214>2828
2199>661>909>1069


In [10]:
df_filtered.shape

(799981, 2)

In [11]:
#Isolate target data
df = df_filtered
X = df["text"].values
X = np.hstack(X)
y = df["label"].values
y = np.hstack(y)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [13]:
c = collections.Counter(y_train)

In [14]:
y_single = c.most_common()[:-52:-1]

In [15]:
X_train_filtered = X_train
y_train_filtered = y_train
for (y_to_delete,_) in y_single:
    index, = np.where(y_train_filtered == y_to_delete)
    y_train_filtered = np.delete(y_train_filtered, index)
    X_train_filtered = np.delete(X_train_filtered, index)
    print(y_to_delete)

3625>3641>2412>363
1395>2736>4446>1799>3413
4015>3285>345>1585>3448
1608>2227>574>2130
3625>3641>3549>3028
4015>2824>2964>2473>839>2732>1977
3292>2375>4400>3342
2199>661>4818>3006
3292>3581>2023
4015>3754>3580>4753>3650>4100
2075>1724>579>2385>1981
4015>2824>2964>2473>839>2966>1930
4015>870>4606>254
3292>290>580>2908
1395>410>2503>528>991
3292>3581>1878>1337
2199>661>4818>4309
3730>1439>4557
1395>2736>135>4873>941
2075>2267>2180>508
4015>4868>3880
4015>3754>3663>512>2406>533
4015>3285>345>483>2923
3625>3641>1599>3116
3093>4104>3495
2075>3671>956>907>1303
1395>2736>4446>4316>4789
4015>2824>2964>32>2341>1029
2075>3671>700>4621
4564>1265>26>4465>4897
3730>2720>1864>1838
4015>2337>2943>2570>2866
4015>2824>567>1982
4238>1625>3571>2034
4015>2824>2964>2473>839>2966>4511
3625>3641>1599>822
3730>2720>1836>2581
2075>3671>2703>3684
3730>1439>805>4129
3625>3641>2998>2294
1395>410>474>105>8
4238>321>753>4424
2075>3407>2214>605
3625>381>622
2075>3671>396>4126
4015>2824>2964>2886>4183>1530>1904
4015>

In [16]:
y_train_filtered.shape

(599934,)

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

class ReShaper(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.vars = [] 

    def transform(self, X, y=None):
        return X.reshape(-1,)  # where the actual feature extraction happens

    def fit(self, X, y=None):
        return self  # generally does nothing

In [27]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
max_features = 400000
clf = Pipeline([
  ('features', FeatureUnion([
    ('characters', Pipeline([
      ('char_reshape', ReShaper()),
      ('char_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode',max_features=int(max_features/2)))
    ])),
    ('words', Pipeline([
      ('char_reshape', ReShaper()),
      ('words_tf_idf', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation),max_features=int(max_features/2)))
    ])),
  ],n_jobs=2)),
  ('classifier', LinearSVC(verbose=1))
])

In [28]:
clf.fit(X_train_filtered.reshape(-1,1), y_train_filtered)

[LibLinear]

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=2,
       transformer_list=[('characters', Pipeline(memory=None,
     steps=[('char_reshape', ReShaper()), ('char_tf_idf', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='conte...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=1))])

In [29]:
joblib.dump(clf, '/home/rakutenPrefitTest.pkl')

['/home/rakutenPrefitTest.pkl']

In [30]:
y_pred = clf.predict(X_test.reshape(-1,1))
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

OSError: [Errno 12] Cannot allocate memory

In [None]:
model = CalibratedClassifierCV(clf, cv='prefit',method='isotonic')
model.fit(X_train_filtered.reshape(-1,1), y_train_filtered)

In [None]:
import gc
gc.collect()

In [None]:
y_pred = model.predict(X_test.reshape(-1,1))
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)