In [2]:
# Mount Google Drive into the Colab runtime at /content/drive; the data, NLTK
# and model paths configured in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
import pandas as pd
import nltk as nltk
from sklearn.utils import shuffle, resample
from sklearn.model_selection import train_test_split
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import datetime
from sklearn.preprocessing import StandardScaler
import time
import csv
from joblib import dump, load
import pickle

# ==============================================================================================================
# ===================================Config       ==============================================================
# ==============================================================================================================

# --- Run switches -------------------------------------------------------------
NLTK_INITIAL_DOWNLOAD = True   # download the NLTK corpora into NLTK_PATH first
PRODUCE_RESULT = False         # True: predict on the testing file; False: train and validate
RECALCULATE_DATA = True        # False: reuse the precalculated HDF files
SAVE_DATA = True               # store freshly refined frames as HDF
TEST_SIZE = 0.05               # validation share handed to train_test_split
FRACTION = 0.1                 # share of the balanced, shuffled rows that is kept

# --- Locations on the mounted drive -------------------------------------------
NLTK_PATH = "/content/drive/My Drive/data/nltk"
ROOT_PATH = "/content/drive/My Drive/data/power_cords"

TRAINING_FILE_PATH = ROOT_PATH + "/training.csv"
TESTING_FILE_PATH = ROOT_PATH + "/testing.csv"

PRECALCULATED_TRAINING_PATH = ROOT_PATH + "/precalculated_training.hdf"
PRECALCULATED_VALIDATION_PATH = ROOT_PATH + "/precalculated_validation.hdf"

# --- Model artefacts ----------------------------------------------------------
CHOSEN_CLASSIFIER = "logistic_regresion"
CUSTOM_CLF_ID = "full_set"

# Common prefix shared by the classifier dump, the per-column vectorizer
# pickles and the results CSV.
_MODEL_FILE_STEM = ROOT_PATH + "/" + "_".join(["model", CHOSEN_CLASSIFIER, CUSTOM_CLF_ID])

SAVED_CLF_FILE = _MODEL_FILE_STEM + ".dump"
SAVED_VECTORIZERS_FILES = _MODEL_FILE_STEM + "_{}.vec"   # "{}" is filled with the column name
RESULT_DATA_FILE = _MODEL_FILE_STEM + "_results.csv"

# Raw input columns used as model features. Each column is cleaned separately
# and gets its own vectorizer.
PARAMS_COLL = [
    'GL',
    'ASIN_STATIC_ITEM_NAME',
    'ASIN_STATIC_BULLET_POINT',
    'ASIN_STATIC_SUBJECT_KEYWORD',
    'ASIN_STATIC_ITEM_CLASSIFICATION',
    'ASIN_STATIC_ITEM_TYPE_KEYWORD',
    # The two columns below are currently excluded from the feature set.
    #'ASIN_STATIC_MATERIAL',
    #'ASIN_STATIC_STYLE',
    'ASIN_STATIC_PRODUCT_DESCRIPTION',
    'ASIN_STATIC_BRAND',
    'ASIN_STATIC_BATTERIES_INCLUDED',
    'ASIN_STATIC_BATTERIES_REQUIRED'
]

# Optional per-column callables. When a column appears here its callable is
# applied instead of the default string normalisation. Empty for now.
COLLS_TRANSFORMS = {
        # TODO: add transformers, e.g. for ASIN_STATIC_BATTERIES_INCLUDED.
    }

# Label column of the training CSV; rows are selected on the values 0 and 1.
TARGET_COL = 'target_label'
# Row-identifier column requested from the testing CSV.
ID_COLL = "ID"

# ==============================================================================================================
# ===================================Misc  =====================================================================
# ==============================================================================================================


# Key under which every DataFrame is stored inside its HDF file.
PANDAS_STORE_KEY = "data"

# Raw column name -> name of the column holding its cleaned token list.
POST_PROCESSED_COLL_MAP = {raw_name: raw_name + "_cleared" for raw_name in PARAMS_COLL}
# The cleaned column names, in the same order as PARAMS_COLL.
POST_PROCESSED_COLL = [POST_PROCESSED_COLL_MAP[raw_name] for raw_name in PARAMS_COLL]

# Producing results uses every row and holds nothing back for validation.
if PRODUCE_RESULT:
    FRACTION, TEST_SIZE = 1.0, 0

if NLTK_INITIAL_DOWNLOAD:
    for _corpus in ("wordnet", 'stopwords'):
        nltk.download(_corpus, NLTK_PATH)
# Make NLTK look for its corpora in the drive folder.
nltk.data.path.append(NLTK_PATH)

class ScopedTimer():
    """Sequential stopwatch for named stages.

    ``start(name)`` closes the stage that is currently running (if any),
    prints its duration and opens a new one. ``finish()`` closes the last
    stage. ``print()`` lists every recorded stage.

    ``timed`` holds one tuple per stage: ``(name,)`` while the stage is
    running and ``(name, elapsed_seconds_as_str)`` once it has been closed.
    """

    def __init__(self):
        self.timed = []
        # Start time of the running stage; None when no stage is running.
        self.last_time = None

    def _close_running_stage(self):
        """Record the elapsed time of the running stage.

        Returns the closed ``(name, elapsed)`` entry, or None when no stage
        was running. Resetting ``last_time`` makes repeated calls harmless, so
        a stage can never receive more than one elapsed value.
        """
        if self.last_time is None or not self.timed:
            return None
        elapsed = time.time() - self.last_time
        self.last_time = None
        self.timed[-1] += (str(elapsed),)
        return self.timed[-1]

    def start(self, name):
        """Close the running stage (printing its duration) and open ``name``."""
        closed = self._close_running_stage()
        if closed is not None:
            print("{}: {}s".format(closed[0], closed[1]))

        self.last_time = time.time()
        self.timed.append((name,))

    def finish(self):
        """Close the running stage without printing; safe to call repeatedly."""
        self._close_running_stage()

    def print(self):
        """Print a table of all stages; a still-running stage is marked as such."""
        print("[Name{}] [time]".format("".ljust((15))))
        for t in self.timed:
            # A stage that has not been closed yet has no elapsed value.
            elapsed = "{}s".format(t[1]) if len(t) > 1 else "(running)"
            print("{} {}".format(t[0].ljust(21), elapsed))

# ==============================================================================================================
# ===================================Load data    ==============================================================
# ==============================================================================================================


def get_dataset(filename):
    """Read the UTF-8 encoded CSV at ``filename`` and return it as a DataFrame."""
    frame = pd.read_csv(filename, encoding='utf-8')
    return frame


def strip_unused_colls(df, used_colls):
    """Return a new frame holding only ``used_colls``, in the given order.

    The result is an explicit copy, so callers may assign columns on it
    without pandas chained-assignment warnings and without touching ``df``.
    """
    return df[used_colls].copy()


# Used for the training data: balance the two classes, shuffle, and keep a FRACTION-sized sample.
def shuffle_resample_cut(df, target, fraction=None, random_state=None):
    """Balance the two classes, shuffle, and keep the leading share of rows.

    Rows with ``target == 1`` are drawn with replacement until there are as
    many of them as rows with ``target == 0``. The combined frame is shuffled
    and its first ``fraction`` share is returned.

    :param df: frame containing the ``target`` column.
    :param target: name of the label column (values 0 and 1 are used).
    :param fraction: share of rows to keep; defaults to the module-level FRACTION.
    :param random_state: seed passed to both the resampling and the shuffle so
        a run can be reproduced; None keeps the unseeded behaviour.
    :raises ValueError: when there are label-0 rows but no label-1 rows to draw from.
    """
    if fraction is None:
        fraction = FRACTION

    negatives = df[df[target] == 0]
    positives = df[df[target] == 1]

    if len(positives) == 0 and len(negatives) > 0:
        raise ValueError(
            "Cannot balance classes: no rows with {} == 1".format(target))

    positives = resample(
        positives, replace=True, n_samples=int(len(negatives)), random_state=random_state)

    balanced = shuffle(pd.concat([negatives, positives], axis=0), random_state=random_state)

    return balanced.head(int(len(balanced) * float(fraction)))


def standarise_and_transform(df, params_colls, colls_transforms):
    """Normalise the listed columns of ``df`` in place and return ``df``.

    A column that has an entry in ``colls_transforms`` is mapped through that
    callable. Every other column is converted to text, with missing values
    (anything whose text form is ``"nan"``) written as ``"NaN"``.

    :param df: frame to modify; its columns are overwritten.
    :param params_colls: names of the columns to process.
    :param colls_transforms: dict of column name -> callable applied per value.
    :return: the same ``df`` object.
    """
    def standarise(x):
        try:
            as_text = str(x)
        except Exception:
            # Best-effort fallback: keep only the ASCII part, but still hand
            # back text (not bytes) so later string/regex cleaning keeps working.
            return x.encode("ascii", "ignore").decode("ascii")
        return "NaN" if as_text == "nan" else as_text

    for p in params_colls:
        transform = colls_transforms[p] if p in colls_transforms else standarise
        df[p] = df[p].apply(transform)

    return df


def load_data(filename, used_colls):
    """Load a CSV, keep only ``used_colls`` and normalise the feature columns.

    The feature columns (PARAMS_COLL) are normalised with COLLS_TRANSFORMS
    where an entry exists and with the default string conversion otherwise.
    """
    selected = strip_unused_colls(get_dataset(filename), used_colls)
    return standarise_and_transform(selected, PARAMS_COLL, COLLS_TRANSFORMS)


def load_train_and_validation_data():
    """Load the training CSV, balance and sample it, then split it.

    :return: ``(df_train, df_validation)`` as produced by ``train_test_split``
        with ``test_size=TEST_SIZE``.
    """
    scoped_timer.start("Load raw data")

    wanted_columns = [TARGET_COL] + PARAMS_COLL
    balanced_sample = shuffle_resample_cut(
        load_data(TRAINING_FILE_PATH, wanted_columns), TARGET_COL)

    # NOTE(review): the label-1 rows were drawn with replacement before this
    # split, so copies of one row can presumably end up on both sides and
    # inflate the validation scores - confirm whether that is acceptable.
    df_train, df_validation = train_test_split(balanced_sample, test_size=TEST_SIZE)

    print("Validation size {}".format(df_validation.shape[0]))
    print("Train size {}".format(df_train.shape[0]))

    return df_train, df_validation


def print_first_n_rows(filenae,df):
    """Print ``filenae`` in square brackets followed by the first three rows of ``df``."""
    header = "[{}]".format(filenae)
    preview = df.head(3)
    print(header)
    print(preview)


def load_precalculated_data_from_file():
    """Read the stored training and validation frames from their HDF files.

    Prints a three-row preview and the size of each frame.

    :return: ``(train_df, validation_df)``.
    """
    scoped_timer.start("Load recalculated data")
    train_df = pd.read_hdf(PRECALCULATED_TRAINING_PATH, PANDAS_STORE_KEY)
    validation_df = pd.read_hdf(PRECALCULATED_VALIDATION_PATH, PANDAS_STORE_KEY)

    print("First 3 rows from recalculated data")
    previews = (
        (PRECALCULATED_TRAINING_PATH, train_df),
        (PRECALCULATED_VALIDATION_PATH, validation_df),
    )
    for path, frame in previews:
        print_first_n_rows(path, frame)

    print("Validation size {}".format(validation_df.shape[0]))
    print("Train size {}".format(train_df.shape[0]))

    return train_df, validation_df

# ==============================================================================================================
# ===================================Refine data    ============================================================
# ==============================================================================================================


# Fragments removed outright, applied one after another in this order:
# HTML-like tags, "&#..;" entities, digit runs, underscores, hyphens.
_PRE_CLEAN_DROP_PATTERNS = tuple(
    re.compile(pattern) for pattern in (r'<.*?>', r'&#..;', r'[0-9]+', r'_', r'-'))

# Punctuation turned into a space. Each listed character becomes one space;
# a run of dots collapses into a single space.
_PRE_CLEAN_SPACE_PATTERN = re.compile(r'[%?$*&;:\[\],^+/]|\.+')


def pre_clean(raw_html):
    """Lower-case ``raw_html`` and strip markup, digits and punctuation.

    :param raw_html: text to clean.
    :return: the cleaned text; ``""`` for empty or non-string input.
    """
    # Non-string input (numbers, bytes, None) yields an empty string instead
    # of relying on a catch-all exception handler.
    if not raw_html or not isinstance(raw_html, str):
        return ""

    clean_text = raw_html.lower()
    for pattern in _PRE_CLEAN_DROP_PATTERNS:
        clean_text = pattern.sub('', clean_text)

    return _PRE_CLEAN_SPACE_PATTERN.sub(' ', clean_text)


def post_clean(txt):
    """Remove every apostrophe from ``txt``.

    :param txt: text to clean.
    :return: the text without ``'`` characters; ``""`` for empty or
        non-string input.
    """
    # Explicit type check replaces the former catch-all exception handler.
    if not txt or not isinstance(txt, str):
        return ""

    return txt.replace("'", "")


# Words dropped during refinement. Kept as a frozenset so the per-token
# membership test is a hash lookup instead of a scan over a list.
_UNWANTED_WORDS = frozenset([
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
    "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's",
    'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
    'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am',
    'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over',
    'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
    'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'nor', 'only', 'own',
    'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', "that's", "i'd", "it's",
    "i'll", "i'm", "i've", "", 'against', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't",
    'didn', "didn't",
    'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shouldn', "shouldn't", 'wasn',
    "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])


def preprocess_naive(df, params_colls):
    """Turn each raw text column into a column of cleaned, stemmed tokens.

    For every column in ``params_colls`` the text is pre-cleaned, split on
    whitespace, stripped to alphanumeric characters, lemmatized, filtered
    (unwanted words and words of two characters or fewer are dropped) and
    stemmed. The token list is stored under the column's
    POST_PROCESSED_COLL_MAP name and the raw column is deleted.

    ``df`` is modified in place and also returned.
    """
    scoped_timer.start("Refine data")

    tokenizer = WhitespaceTokenizer()
    lemmatizer = WordNetLemmatizer()
    stemmer = SnowballStemmer('english')

    def stripper(token):
        # Keep letters and digits only. Tokens always arrive as str from the
        # tokenizer, so no exception handling is needed here.
        return ''.join(ch for ch in token if ch.isalnum())

    def refine(text):
        stems = []
        for token in tokenizer.tokenize(pre_clean(text)):
            word = lemmatizer.lemmatize(stripper(token))
            if len(word) > 2 and word not in _UNWANTED_WORDS:
                stems.append(post_clean(stemmer.stem(word)))
        return stems

    for p in params_colls:
        df[POST_PROCESSED_COLL_MAP[p]] = df[p].apply(refine)

        print("Coll '{}' refined ".format(p).ljust(80, "="))
        print(df[p].head(3))
        print(df[POST_PROCESSED_COLL_MAP[p]].head(3))
        # The raw text is no longer needed once its token column exists.
        del df[p]

    return df


def convert_to_data(df, params_colls):
    """Join each row's token list back into one space-separated string.

    :param df: frame holding the cleaned token-list columns.
    :param params_colls: raw column names; the matching cleaned column is
        looked up through POST_PROCESSED_COLL_MAP.
    :return: dict mapping each raw column name to a list of strings, one per
        row, in row order.
    """
    # Walk each column directly instead of materialising every row with
    # iterrows(); the values and their order are the same.
    return {
        p: [' '.join(tokens) for tokens in df[POST_PROCESSED_COLL_MAP[p]]]
        for p in params_colls
    }


def save_to_hdf(df, filename):
    """Write ``df`` to the HDF file ``filename`` under PANDAS_STORE_KEY.

    The file is opened with ``mode='w'``. ``key`` is passed by keyword because
    pandas has deprecated passing it positionally.
    """
    df.to_hdf(filename, key=PANDAS_STORE_KEY, mode='w')


def get_learning_data():
    """Provide labels and per-column texts for training and validation.

    With RECALCULATE_DATA set, the raw training file is loaded, refined and
    (when SAVE_DATA is set) stored as HDF. Otherwise the stored frames are
    read back.

    :return: ``(train_labels, validation_labels, training_data,
        validation_data)`` where the labels are lists of ints and the data
        values are the dicts built by ``convert_to_data``.
    """
    if RECALCULATE_DATA:
        train_df, validation_df = load_train_and_validation_data()
        train_df = preprocess_naive(train_df, PARAMS_COLL)
        validation_df = preprocess_naive(validation_df, PARAMS_COLL)

        if SAVE_DATA:
            print("First 3 rows from saved data")
            print_first_n_rows(PRECALCULATED_TRAINING_PATH, train_df)
            print_first_n_rows(PRECALCULATED_VALIDATION_PATH, validation_df)

            save_to_hdf(train_df, PRECALCULATED_TRAINING_PATH)
            save_to_hdf(validation_df, PRECALCULATED_VALIDATION_PATH)
    else:
        train_df, validation_df = load_precalculated_data_from_file()

    # Both branches return label lists. The cached branch used to return the
    # whole DataFrames in the label positions.
    train_labels = [int(label) for label in train_df[TARGET_COL]]
    validation_labels = [int(label) for label in validation_df[TARGET_COL]]

    return train_labels, \
           validation_labels, \
           convert_to_data(train_df, PARAMS_COLL), \
           convert_to_data(validation_df, PARAMS_COLL)


def get_testing_data():
    """Load and refine the testing file.

    :return: ``(test_df, test_data)`` - the refined frame (ID column plus the
        cleaned token columns) and the per-column texts from ``convert_to_data``.
    """
    wanted_columns = [ID_COLL] + PARAMS_COLL
    raw_test_df = load_data(TESTING_FILE_PATH, wanted_columns)

    print("First 3 rows from saved data")
    print_first_n_rows(TESTING_FILE_PATH, raw_test_df)

    refined_df = preprocess_naive(raw_test_df, PARAMS_COLL)
    texts_by_column = convert_to_data(refined_df, PARAMS_COLL)
    return refined_df, texts_by_column


# ==============================================================================================================
# ===================================Create classifier==========================================================
# ==============================================================================================================


def create_vectorizers(data, params_colls, custom_vectorizers):
    """Fit one vectorizer per column and pickle each of them to disk.

    :param data: dict of column name -> list of texts to fit on.
    :param params_colls: columns to build vectorizers for.
    :param custom_vectorizers: dict of column name -> vectorizer instance used
        instead of the default; the instance passed in is fitted itself.
    :return: dict of column name -> fitted vectorizer.
    """
    fitted = {}
    for col in params_colls:
        if col in custom_vectorizers:
            vectorizer = custom_vectorizers[col]
        else:
            vectorizer = TfidfVectorizer(max_features=500, use_idf=False, ngram_range=(1, 2))
        fitted[col] = vectorizer

        print("{} = {}".format(col, data[col][:3]))
        vectorizer.fit(data[col])

        # One pickle per column, named after the column.
        target_path = SAVED_VECTORIZERS_FILES.format(col)
        with open(target_path, 'wb') as sink:
            pickle.dump(vectorizer, sink)

    return fitted


def load_vectorizers(params_colls):
    """Unpickle the stored vectorizer of every column in ``params_colls``.

    :return: dict of column name -> vectorizer.
    """
    def read_one(col):
        # NOTE: pickle.load executes code embedded in the file; only point
        # this at vectorizer files written by this notebook.
        with open(SAVED_VECTORIZERS_FILES.format(col), 'rb') as source:
            return pickle.load(source)

    return {col: read_one(col) for col in params_colls}


def create_feed(input_data, params_colls, vectorizers, scalers=None):
    """Vectorize and standardise every column, then stack them side by side.

    :param input_data: dict of column name -> list of texts. Each processed
        entry is deleted from this dict to release memory, so the caller's
        dict is emptied as a side effect.
    :param params_colls: columns to process, in output order.
    :param vectorizers: dict of column name -> fitted vectorizer.
    :param scalers: optional dict of column name -> fitted StandardScaler.
        A column found in the dict is scaled with that scaler; a column that
        is missing gets a scaler fitted on the given data, which is then
        stored in the dict. Passing the same dict for the training data first
        and for validation/test data afterwards scales all of them with the
        training statistics. With the default None, a new scaler is fitted on
        every call, as before.
    :return: 2-D array with the scaled features of all columns.
    """
    features = []

    for col in params_colls:
        transformed = vectorizers[col].transform(input_data[col]).toarray()
        # save memory
        del input_data[col]

        scaler = scalers.get(col) if scalers is not None else None
        if scaler is None:
            scaler = StandardScaler()
            scaler.fit(transformed)
            if scalers is not None:
                scalers[col] = scaler

        features.append(scaler.transform(transformed))

    return np.column_stack(features)


def get_classifier():
    """Build the grid-search estimator selected by CHOSEN_CLASSIFIER.

    :raises KeyError: when CHOSEN_CLASSIFIER names no known classifier.
    """
    def build_svm_search():
        # Two grids: an RBF kernel over gamma and C, and a linear kernel over C.
        param_grid = [
            {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
            {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
        ]
        return GridSearchCV(SVC(), param_grid, cv=5, scoring='precision_macro')

    def build_logistic_search():
        estimator = LogisticRegression(class_weight='balanced', solver='lbfgs')
        param_grid = {'penalty': ['l2'], 'C': [0.5, 1, 1.5, 2, 2.5]}
        return GridSearchCV(estimator, param_grid, cv=10)

    # Only the selected builder is called, so only one estimator is created.
    builders = {
        'SVM': build_svm_search,
        'logistic_regresion': build_logistic_search,
    }

    return builders[CHOSEN_CLASSIFIER]()


# ==============================================================================================================
# ===================================Train & Validate===========================================================
# ==============================================================================================================


def train_and_validate():
    """Train the chosen classifier, store it, and report validation metrics.

    Fits and pickles the per-column vectorizers, fits the classifier on the
    training features, dumps it to SAVED_CLF_FILE, then prints the confusion
    matrix, precision and accuracy on the validation split.
    """
    print("Load data")
    train_labels, validation_labels, training_data, validation_data = get_learning_data()

    scoped_timer.start("Create vectorizers")

    # The product description gets a larger vocabulary than the other columns.
    custom_vectorizers = {
        'ASIN_STATIC_PRODUCT_DESCRIPTION': TfidfVectorizer(max_features=5000, use_idf=False, ngram_range=(1, 2))
    }

    vectorizers = create_vectorizers(training_data, PARAMS_COLL, custom_vectorizers)
    training_features = create_feed(training_data, PARAMS_COLL, vectorizers)

    # to save memory
    training_data = None

    print('train labels count: ' + str(len(train_labels)))

    clf = get_classifier()

    scoped_timer.start("Train classifier")
    clf.fit(training_features, train_labels)

    scoped_timer.start("Dump classifier")
    dump(clf, SAVED_CLF_FILE)

    scoped_timer.start("Create validation data")

    # NOTE(review): this call standardises the validation features with
    # statistics computed from the validation data itself rather than from the
    # training data - confirm whether that is intended.
    validation_feed = create_feed(validation_data, PARAMS_COLL, vectorizers)

    scoped_timer.start("Predict on validation data")
    predicted_validation_labels = clf.predict(validation_feed)

    # Fixing the label set keeps the matrix 2x2 even when one class is absent
    # from both the true and predicted labels; otherwise the four-way unpack
    # below would fail.
    conf_matrix = confusion_matrix(validation_labels, predicted_validation_labels, labels=[0, 1])
    tn, fp, fn, tp = conf_matrix.ravel()

    predicted_positives = float(tp) + float(fp)
    # Precision is undefined when nothing was predicted positive; report NaN
    # without triggering a division warning.
    precision = float(tp) / predicted_positives if predicted_positives else float('nan')
    accuracy = (float(tp) + float(tn)) / float(len(validation_labels))

    print("[Validation confusion matrix]")
    print(conf_matrix)
    print('tp:' + str(tp) + '\nfp:' + str(fp))
    print('Precision ' + str(precision))
    print('Accuracy ' + str(accuracy))

    print('Accuraccy score ' + str(accuracy_score(validation_labels, predicted_validation_labels)))


# ==============================================================================================================
# ===================================Produce results============================================================
# ==============================================================================================================

def predict_results():
    """Predict labels for the testing file and export them as CSV.

    Loads the stored classifier and vectorizers, predicts on the refined
    testing data and writes one ``ID`` / ``model-score`` row per input row to
    RESULT_DATA_FILE.
    """
    scoped_timer.start("Load classifier")
    clf = load(SAVED_CLF_FILE)

    scoped_timer.start("Load test data")
    test_df, test_data = get_testing_data()

    scoped_timer.start("Load vectorizers")
    vectorizers = load_vectorizers(PARAMS_COLL)

    testing_feed = create_feed(test_data, PARAMS_COLL, vectorizers)

    scoped_timer.start("Predict on testing data")
    predicted_testing_labels = clf.predict(testing_feed)

    # Read the ids from the configured id column (the same one that was
    # requested when loading the file) without iterating over whole rows.
    # Both output columns are written as text.
    result_df = pd.DataFrame({
        'ID': [str(r_id) for r_id in test_df[ID_COLL]],
        'model-score': [str(r_label) for r_label in predicted_testing_labels],
    }, columns=['ID', 'model-score'])

    result_df.to_csv(RESULT_DATA_FILE, index=False)

    print("Results exported " + str(datetime.datetime.now()))


# Shared timer used by the pipeline functions above.
scoped_timer = ScopedTimer()

# Pick the single stage to run: export predictions, or train and validate.
_selected_stage = predict_results if PRODUCE_RESULT else train_and_validate
_selected_stage()

# Close the last stage and print the timing table.
scoped_timer.finish()
scoped_timer.print()




[nltk_data] Downloading package wordnet to /content/drive/My
[nltk_data]     Drive/data/nltk...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /content/drive/My
[nltk_data]     Drive/data/nltk...
[nltk_data]   Package stopwords is already up-to-date!
Load data
Validation size 2046
Train size 38863
Load raw data: 5.4545981884002686s
59499        gl_pet_products
22344                  gl_pc
180668    gl_lawn_and_garden
Name: GL, dtype: object
59499        [glpetproduct]
22344                [glpc]
180668    [gllawnandgarden]
Name: GL_cleared, dtype: object
59499         DENNERLE Spare Part LED Power Supply Unit 5.0
22344     Replacement battery for 5110, 6110, 6150, 7110...
180668       Real Life XL Boxer Dog Garden Ornament (SizeA)
Name: ASIN_STATIC_ITEM_NAME, dtype: object
59499      [dennerl, spare, part, led, power, suppli, unit]
22344                                     [replac, batteri]
180668    [real, life, boxer, dog, garden, or

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['GL_cleared', 'ASIN_STATIC_ITEM_NAME_cleared', 'ASIN_STATIC_BULLET_POINT_cleared', 'ASIN_STATIC_SUBJECT_KEYWORD_cleared', 'ASIN_STATIC_ITEM_CLASSIFICATION_cleared', 'ASIN_STATIC_ITEM_TYPE_KEYWORD_cleared', 'ASIN_STATIC_PRODUCT_DESCRIPTION_cleared', 'ASIN_STATIC_BRAND_cleared', 'ASIN_STATIC_BATTERIES_INCLUDED_cleared', 'ASIN_STATIC_BATTERIES_REQUIRED_cleared']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


Refine data: 25.882829666137695s
GL = ['glpetproduct', 'glpc', 'gllawnandgarden']
ASIN_STATIC_ITEM_NAME = ['dennerl spare part led power suppli unit', 'replac batteri', 'real life boxer dog garden ornament sizea']
ASIN_STATIC_BULLET_POINT = ['spare part power suppli unit nano power led led power suppli unit spare part', 'capac mah type liion volt size certif iso roh', 'frost resist made durabl materi hand finish extremeley realist life like eye high collect']
ASIN_STATIC_SUBJECT_KEYWORD = ['nano light', 'nan', 'real life reallif reel life reellif vivdart vivid art bone box boxer cruft dog fur garden guard hound lifelik pedigre pooch resin statu']
ASIN_STATIC_ITEM_CLASSIFICATION = ['baseproduct', 'baseproduct', 'baseproduct']
ASIN_STATIC_ITEM_TYPE_KEYWORD = ['nan', 'nan', 'outdoorliv']
ASIN_STATIC_PRODUCT_DESCRIPTION = ['nan', 'standard replac batteri product offer ideal solut exist batteri develop fault need spare period heavi usag would drain first batteri use finest qualiti batteri c



Train classifier: 2521.8138105869293s
Dump classifier: 0.012123823165893555s
Create validation data: 1.1949987411499023s
[Validation confusion matrix]
[[777 281]
 [190 798]]
tp:798
fp:281
Precision 0.7395736793327155
Accuracy 0.7697947214076246
Accuraccy score 0.7697947214076246
[Name               ] [time]
Load raw data         5.4545981884002686s
Refine data           122.39322805404663s
Refine data           25.882829666137695s
Create vectorizers    35.043055057525635s
Train classifier      2521.8138105869293s
Dump classifier       0.012123823165893555s
Create validation data 1.1949987411499023s
Predict on validation data 0.372112512588501s
