In [1]:
from __future__ import division
import pandas as pd 
import numpy as np
import scipy
from scipy.sparse import hstack
import gensim
import boto3
import re
import json
import os
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from nltk.corpus import stopwords
import string
from sklearn import linear_model, datasets
from sklearn.externals import joblib
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from sklearn.metrics import precision_recall_fscore_support
from sklearn.decomposition import TruncatedSVD
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df=pd.read_csv('rdc-catalog-train.tsv',delimiter='\t',encoding='utf-8',header=None)
df.columns = ['text', 'label']

#Isolate target data
X = df["text"].values
X = np.hstack(X)
y = df["label"].values
y = np.hstack(y)

In [3]:
def train_full(classifier, X, y):
    print("X:")
    print(len(X))
    print("y:")
    print(len(y))
    classifier.fit(X, y)
    return classifier


def train_test(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    print("X_train:")
    print(len(X_train))
    print("X_test:")
    print(len(X_test))
    print("y_train:")
    print(len(y_train))
    print("y_test:")
    print(len(y_test))
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
    print("Accuracy: ", classifier.score(X_test, y_test))
    print("Precision: ", weighted_p)
    print("Recall: ", weighted_r)
    print("F1-Score: ", weighted_f1)
    return classifier

In [4]:
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]

def lemmatization_tokenizer(text):
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(w) for w in word_tokenize(text)]

In [5]:
def standardize_metrics(text):
    text = re.sub(r'\b[\d\.\/]+\s?(v|volt)\b', 'metricV', text)
    text = re.sub(r'\b[\d\.\/]+\s?(amp|amps|ampere|amperes)\b', 'metricA', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mah|ah|ampere-hour)\b', 'metricAh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(in|inch|inches)\b', 'metricIn', text)
    text = re.sub(r'\b[\d\.\/]+\s?\"', 'metricIn', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gb|gig|go|giga|gigabit|gigabyte)\b', 'metricGb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(oz|ounce)\b', 'metricOz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fl\.? oz\.?|fluids? ounces?)\b', 'metricFlOz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cwt|quintal)\b', 'metricCwt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz)\b', 'metricHz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(wh|kwh|watt-hour|kilowatt-hour)\b', 'metricWh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(w|kw|watt|kilowatt)\b', 'metricW', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mf|mfd|mmf|mmfd|microfarad)\b', 'metricMfd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ft|feet|foot)\b', 'metricFt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cm|centimeter)\b', 'metricCm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mm|millimeter)\b', 'metricMm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(km|kilometer)\b', 'metricKm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(m|meter)\b', 'metricM', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cell|cells)\b', 'metricCell', text)
    text = re.sub(r'\b[\d\.\/]+\s?(lb|lbs|pound)\b', 'metricLb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yds|yd|yard|yards)\b', 'metricYd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pc|pcs|pieces|piece)\b', 'metricPc', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gal|gals|gallon|gallons)\b', 'metricGal', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yd|yds|yard|yards)\b', 'metricYd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(deg|degs|degree|degrees)\b', 'metricDeg', text)
    text = re.sub(r'\b[\d\.\/]+\s?\°', 'metricDeg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(l|liter|liters)\b', 'metricL', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ml|mls|milliliter|milliliters)\b', 'metricMl', text)
    text = re.sub(r'\b[\d\.\/]+\s?(kg|kilograms|kilogram)\b', 'metricKg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(g|grams|gram)\b', 'metricG', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mg|mgs|milligrams|milligram)\b', 'metricMg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(sq|sqs|square|squares)\b', 'metricSq', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pt|pts|pint|pints)\b', 'metricPt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ohm)\b', 'metricOhm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fz)\b', 'metricFz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ct|cts|carat|carats)\b', 'metricCt', text)
    text = re.sub(r'\b[\d]+p\b', 'metricRes', text)
    text = re.sub(r'\b[\d]+x[\d]+\b', 'metricRes', text)
    text = re.sub(r'\b[\d]+x\b', 'metricX', text)
    return text

def clean_text_standard_metrics_v0plus(text):
    text = text.lower()
    text = standardize_metrics(text)
    text = re.sub(r'\b\d*\.\d+\b', 'nbDec', text)
    text = re.sub(r'\b\d+\/\d+\b', 'nbFra', text)
    text = re.sub(r'\b\d+\b', 'nbNat', text)
    text = re.sub('\d+', '0', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def standardize_char(text):
    text = text.lower()
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)pa?cks?\b', 'pck', text)
    text = re.sub(r'\bpa?cks?(\s|-|\sof\s|)\d+\b', 'pck', text)
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)sets?\b', 'set', text)
    text = re.sub(r'\bsets?(\s|-|\sof\s|)\d+\b', 'set', text)
    text = re.sub(r'\b[\d\.\/]+\s?(v|volt)\b', 'v', text)
    text = re.sub(r'\b[\d\.\/]+\s?(amp|amps|ampere|amperes)\b', 'a', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mah|ah|ampere-hour)\b', 'ah', text)
    text = re.sub(r'\b[\d\.\/]+\s?(in|inch|inches)\b', 'in', text)
    text = re.sub(r'\b[\d\.\/]+\s?\"', 'in', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gb|gig|go|giga|gigabit|gigabyte)\b', 'gb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(oz|ounce)\b', 'oz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fl\.? oz\.?|fluids? ounces?)\b', 'floz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cwt|quintal)\b', 'cwt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz)\b', 'hz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(wh|kwh|watt-hour|kilowatt-hour)\b', 'wh', text)
    text = re.sub(r'\b[\d\.\/]+\s?(w|kw|watt|kilowatt)\b', 'w', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mf|mfd|mmf|mmfd|microfarad)\b', 'mfd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ft|feet|foot)\b', 'ft', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cm|centimeter)\b', 'cm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mm|millimeter)\b', 'mm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(km|kilometer)\b', 'km', text)
    text = re.sub(r'\b[\d\.\/]+\s?(m|meter)\b', 'm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cell|cells)\b', 'cell', text)
    text = re.sub(r'\b[\d\.\/]+\s?(lb|lbs|pound)\b', 'lb', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yds|yd|yard|yards)\b', 'yd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pc|pcs|pieces|piece)\b', 'pcs', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gal|gals|gallon|gallons)\b', 'gal', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yd|yds|yard|yards)\b', 'yd', text)
    text = re.sub(r'\b[\d\.\/]+\s?(deg|degs|degree|degrees)\b', 'deg', text)
    text = re.sub(r'\b[\d\.\/]+\s?\°', 'deg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(l|liter|liters)\b', 'l', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ml|mls|milliliter|milliliters)\b', 'ml', text)
    text = re.sub(r'\b[\d\.\/]+\s?(kg|kilograms|kilogram)\b', 'kg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(g|grams|gram)\b', 'g', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mg|mgs|milligrams|milligram)\b', 'mg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(sq|sqs|square|squares)\b', 'sg', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pt|pts|pint|pints)\b', 'pt', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ohm)\b', 'ohm', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fz)\b', 'fz', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ct|cts|carat|carats)\b', 'ct', text)
    text = re.sub(r'\b[\d]+p\b', 'res', text)
    text = re.sub(r'\b[\d]+x[\d]+\b', 'res', text)
    text = re.sub(r'\b[\d]+x\b', 'x', text)
    text = re.sub(r'\b\d*\.\d+\b', 'deci', text)
    text = re.sub(r'\b\d+\/\d+\b', 'frac', text)
    text = re.sub(r'\b\d+\b', 'nat', text)
    text = re.sub(r'\b(\w*\d\w*[a-z]+\w*|\w*[a-z]+\w*\d\w*)\b', 'sku', text)
    text = re.sub('\d+', '0', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

def xtrem_clean_char(text):
    text = text.lower()
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)pa?cks?\b', '', text)
    text = re.sub(r'\bpa?cks?(\s|-|\sof\s|)\d+\b', '', text)
    text = re.sub(r'\b(\d+\s?x\s?)?\d+(\s|-|\/|\s?per\s?|\s?pc\s?s\/\s?|)sets?\b', '', text)
    text = re.sub(r'\bsets?(\s|-|\sof\s|)\d+\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(v|volt)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(amp|amps|ampere|amperes)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mah|ah|ampere-hour)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(in|inch|inches)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?\"', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gb|gig|go|giga|gigabit|gigabyte)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(oz|ounce)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fl\.? oz\.?|fluids? ounces?)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cwt|quintal)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mhz|hz|khz|ghz|gigahertz|megahertz|kilohertz|hertz)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(wh|kwh|watt-hour|kilowatt-hour)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(w|kw|watt|kilowatt)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mf|mfd|mmf|mmfd|microfarad)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ft|feet|foot)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cm|centimeter)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mm|millimeter)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(km|kilometer)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(m|meter)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(cell|cells)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(lb|lbs|pound)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yds|yd|yard|yards)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pc|pcs|pieces|piece)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(gal|gals|gallon|gallons)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(yd|yds|yard|yards)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(deg|degs|degree|degrees)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?\°', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(l|liter|liters)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ml|mls|milliliter|milliliters)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(kg|kilograms|kilogram)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(g|grams|gram)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(mg|mgs|milligrams|milligram)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(sq|sqs|square|squares)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(pt|pts|pint|pints)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ohm)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(fz)\b', '', text)
    text = re.sub(r'\b[\d\.\/]+\s?(ct|cts|carat|carats)\b', '', text)
    text = re.sub(r'\b[\d]+p\b', '', text)
    text = re.sub(r'\b[\d]+x[\d]+\b', '', text)
    text = re.sub(r'\b[\d]+x\b', '', text)
    text = re.sub(r'\b\d*\.\d+\b', '', text)
    text = re.sub(r'\b\d+\/\d+\b', '', text)
    text = re.sub(r'\b\d+\b', '', text)
    text = re.sub(r'\b(\w*\d\w*[a-z]+\w*|\w*[a-z]+\w*\d\w*)\b', '', text)
    text = re.sub('\d+', '', text) 
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text



In [6]:
#TEST 26: lemmatization + clean_text_standard_metrics_v0plus preprocessor + sublinear + single letter + min_df 2
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,
                                stop_words=single_letter+list(string.punctuation))),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Accuracy: 0.817935
Precision: 0.8095020763184998
Recall: 0.817935
F1-Score: 0.8100457644640601
finished train


In [None]:
#TEST 28b: char n gram
clf = Pipeline([
    ('vectorizer', TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode')),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
clf=train_test(clf, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]Accuracy: 0.820155
Precision: 0.8108338638615838
Recall: 0.820155
F1-Score: 0.8108139486508633
finished train


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [7]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode')
char_tfidf.fit(X_train)
len(char_tfidf.get_feature_names())

449377

In [None]:
char_lsa = Pipeline([
    ('vectorizer', char_tfidf),
    ('reducer', TruncatedSVD(n_components=1000)),
    ('classifier', LinearSVC(verbose=1))
])

print("start train...")
char_lsa=train_test(char_lsa, X, y)
print("finished train")

start train...
X_train:
600000
X_test:
200000
y_train:
600000
y_test:
200000
[LibLinear]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

In [7]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode')
char_tfidf.fit(X_train)
len(char_tfidf.get_feature_names())

449377

In [8]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1, 2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

441010

In [7]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,max_df=0.9, norm='l2', ngram_range=(3,3), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

111697

In [19]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode')
char_tfidf.fit(X_train)
len(char_tfidf.get_feature_names())

449377

In [13]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)

X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)

In [14]:
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [15]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print "Accuracy: %s" % clf.score(X_test_tfidf_combined, y_test)
print "Precision: %s" % weighted_p
print "Recall: %s" % weighted_r
print "F1-Score: %s" % weighted_f1

[LibLinear]Accuracy: 0.81949
Precision: 0.8112334240723831
Recall: 0.81949
F1-Score: 0.8114651490843027


In [16]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=4,max_df=0.9, norm='l2', ngram_range=(3,3), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

150982

In [17]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [18]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print "Accuracy: %s" % clf.score(X_test_tfidf_combined, y_test)
print "Precision: %s" % weighted_p
print "Recall: %s" % weighted_r
print "F1-Score: %s" % weighted_f1

[LibLinear]Accuracy: 0.819045
Precision: 0.8111303551430668
Recall: 0.819045
F1-Score: 0.81123674139736


In [22]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(3,3), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

228091

In [23]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [24]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print "Accuracy: %s" % clf.score(X_test_tfidf_combined, y_test)
print "Precision: %s" % weighted_p
print "Recall: %s" % weighted_r
print "F1-Score: %s" % weighted_f1

[LibLinear]Accuracy: 0.81891
Precision: 0.8112140665712793
Recall: 0.81891
F1-Score: 0.8113070724856329


In [29]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(3,3), preprocessor=xtrem_clean_char,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

161834

In [30]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [33]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:",accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]Accuracy: 0.818535
Precision: 0.8102391684889123
Recall: 0.818535
F1-Score: 0.8104876369118739


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [28]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=xtrem_clean_char,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

102097

In [29]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=4,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=xtrem_clean_char,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

132037

In [30]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=xtrem_clean_char,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

185880

In [31]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=xtrem_clean_char,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

315651

In [32]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

123370

In [33]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=4,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

158373

In [34]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

221679

In [35]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

373272

In [None]:
#word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))

In [35]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [36]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]Accuracy: 0.822455
Precision: 0.8157397291306441
Recall: 0.822455
F1-Score: 0.8159350481617358


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [8]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(2,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

373330

In [9]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [None]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]

In [8]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

269971

In [9]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [None]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]

In [15]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=stopwords.words('english')+single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

262116

In [16]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

269971

In [20]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1,1), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

67796

In [21]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [22]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]Accuracy: 0.822295
Precision: 0.8148492737747316
Recall: 0.822295
F1-Score: 0.815275452910897


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [23]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=stopwords.words('english')+single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

262116

In [24]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [25]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]Accuracy: 0.82849
Precision: 0.8217932982054609
Recall: 0.82849
F1-Score: 0.8220414566191109


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [26]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',preprocessor=xtrem_clean_char,strip_accents='unicode')
char_tfidf.fit(X_train)
len(char_tfidf.get_feature_names())

95626

In [27]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

269971

In [28]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [29]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]Accuracy: 0.82056
Precision: 0.813555189615295
Recall: 0.82056
F1-Score: 0.8139858836602456


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [30]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

441126

In [31]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [32]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]Accuracy: 0.82174
Precision: 0.8147924757938853
Recall: 0.82174
F1-Score: 0.8151024930353313


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [10]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',preprocessor=xtrem_clean_char,strip_accents='unicode')
char_tfidf.fit(X_train)
len(char_tfidf.get_feature_names())

84226

In [11]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=single_letter+list(string.punctuation))
word_tfidf.fit(X_train)
len(word_tfidf.get_feature_names())

441126

In [12]:
X_train_tfidf_char = char_tfidf.transform(X_train)
X_train_tfidf_word = word_tfidf.transform(X_train)
X_test_tfidf_char = char_tfidf.transform(X_test)
X_test_tfidf_word = word_tfidf.transform(X_test)
X_train_tfidf_combined = hstack((X_train_tfidf_char, X_train_tfidf_word))
X_test_tfidf_combined = hstack((X_test_tfidf_char, X_test_tfidf_word))

In [13]:
clf = LinearSVC(verbose=1)
clf.fit(X_train_tfidf_combined, y_train)

y_pred = clf.predict(X_test_tfidf_combined)
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test,y_pred, pos_label=None, average='weighted')
print("Accuracy:", accuracy_score(y_pred, y_test))
print("Precision:", weighted_p)
print("Recall:", weighted_r)
print("F1-Score:", weighted_f1)

[LibLinear]Accuracy: 0.821735
Precision: 0.8148258025181456
Recall: 0.821735
F1-Score: 0.8151270989332956


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [6]:
char_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2,norm='l2', ngram_range=(1,4),lowercase=True, analyzer='char',strip_accents='unicode')
char_tfidf.fit(X)
len(char_tfidf.get_feature_names())

503518

In [7]:
single_letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y']
word_tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3,max_df=0.9, norm='l2', ngram_range=(1,2), preprocessor=clean_text_standard_metrics_v0plus,strip_accents='unicode',tokenizer=lemmatization_tokenizer,stop_words=stopwords.words('english')+single_letter+list(string.punctuation))
word_tfidf.fit(X)
len(word_tfidf.get_feature_names())

330972

In [8]:
X_tfidf_char = char_tfidf.transform(X)
X_tfidf_word = word_tfidf.transform(X)
X_tfidf_combined = hstack((X_tfidf_char, X_tfidf_word))

In [9]:
joblib.dump(word_tfidf, 'word_tfidf.pkl')

['word_tfidf.pkl']

In [10]:
joblib.dump(char_tfidf, 'char_tfidf.pkl')

['char_tfidf.pkl']

[LibLinear]

In [14]:
char_tfidf = joblib.load('char_tfidf.pkl') 

In [15]:
word_tfidf = joblib.load('word_tfidf.pkl') 

In [16]:
X_tfidf_char = char_tfidf.transform(X)
X_tfidf_word = word_tfidf.transform(X)

In [17]:
X_tfidf_char

<800000x503518 sparse matrix of type '<class 'numpy.float64'>'
	with 162424877 stored elements in Compressed Sparse Row format>

In [19]:
X_tfidf_char.toarray()

MemoryError: 

In [21]:
X_tfidf_combined = hstack((X_tfidf_char, X_tfidf_word))

In [22]:
X_tfidf_combined

<800000x834490 sparse matrix of type '<class 'numpy.float64'>'
	with 175872548 stored elements in Compressed Sparse Row format>

In [26]:
X_tfidf_combined.check_format()

In [None]:
scipy.version.full_version

In [None]:
clf = LinearSVC(verbose=1)
clf.fit(X_tfidf_combined, y)

[LibLinear]

In [None]:
joblib.dump(clf, 'rakutenModel.pkl')

In [None]:
clf = joblib.load('rakutenModel.pkl') 

In [None]:
df=pd.read_csv('./rdc-catalog-test.tsv',delimiter='\t',encoding='utf-8')
df.head()
df.values.shape

In [None]:
values=np.hstack(df.values)
predictions=clf.predict(values)

In [None]:
df.insert(1,'CategoryIdPath',[pred for pred in predictions])
df.head()

In [None]:
df.to_csv("submission_test_stdz.tsv", sep='\t', encoding='utf-8',index=False,header=False)
sub_df=pd.read_csv("submission_test_stdz.tsv",delimiter='\t',encoding='utf-8',header=None)
sub_df.head()

In [None]:
from sklearn.metrics import classification_report
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
allStats = precision_recall_fscore_support(y_test, y_pred, average=None)

In [11]:
import collections

In [14]:
c = collections.Counter(df['label'])

In [15]:
c

Counter({u'4238>321>2813>3373': 58,
         u'4015>282>2484': 2231,
         u'2075>854>3035': 22,
         u'4015>3754>3663>1500>4072': 1656,
         u'4238>4048>422>182': 72,
         u'3292>1370>3828>2388>1561': 7,
         u'4238>4048>2857>607': 6,
         u'2296>2435>2801': 154,
         u'4015>3636>1319>1928>3213': 122,
         u'4015>3285>1443>2410': 7,
         u'3625>702>3234>3612': 26,
         u'4015>2824>2964>2100>3338>392': 6,
         u'4015>3636>1319>152>2475>2811': 298,
         u'1608>2320>498>2543': 128,
         u'3292>2375>446>2580': 10,
         u'4015>3285>345>1237>3665': 25,
         u'2075>1724>3017>4474': 348,
         u'1208>310>397>1845>3800': 15,
         u'4238>1104>4623': 8,
         u'2199>661>646>3788': 18,
         u'2075>1724>1552>4412>2916': 24,
         u'4015>4733>1818': 5,
         u'4238>1625>4647>4158>3854': 6,
         u'3292>1370>3828>2388>4293': 449,
         u'1608>4269>3031>3791': 555,
         u'3292>1370>3828>2388>1691': 69,
         u

In [35]:
df[df['label'] == '4015>3754>3663>512>3569'].shape

(40, 2)

In [None]:
df[df['label'] == '4015>3754>3663>1500>2605']