In [8]:
# %load main.py
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

In [1]:
import re
from bs4 import BeautifulSoup

def preprocessor(text):
    # remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    
    # regex for matching emoticons, keep emoticons, ex: :), :-P, :-D
    r = '(?::|;|=|X)(?:-)?(?:\)|\(|D|P)'
    emoticons = re.findall(r, text)
    text = re.sub(r, '', text)
    
    # convert to lowercase and append all emoticons behind (with space in between)
    # replace('-','') removes nose of emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-','')
    return text

In [2]:
# %load preprocess.py
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def tokenizer_stem_nostop(text):
    nltk.download('stopwords', quiet=True)
    stop = stopwords.words('english')
    porter = PorterStemmer()
    return [porter.stem(w) for w in re.split('\s+', text.strip()) \
            if w not in stop and re.match('[a-zA-Z]+', w)]



In [4]:
df = pd.read_csv('./dataset/train.csv')
print(df.head(5))

df_small = df.sample(n=1000, random_state=0)

pipe = Pipeline([('vect', HashingVectorizer(n_features=2**10,
                                             preprocessor=preprocessor, 
                                             tokenizer=tokenizer_stem_nostop)), 
                  ('clf', LogisticRegression(solver = "liblinear"))])
        
scores = cross_val_score(estimator=pipe, X=df_small['Page content'], y=df_small['Popularity'], \
                         cv=10, scoring='roc_auc')
print('LogisticRegression+preprocess+hash: %.3f (+/-%.3f)' % (scores.mean(), scores.std()))

   Id  Popularity                                       Page content
0   0          -1  <html><head><div class="article-info"> <span c...
1   1           1  <html><head><div class="article-info"><span cl...
2   2           1  <html><head><div class="article-info"><span cl...
3   3          -1  <html><head><div class="article-info"><span cl...
4   4          -1  <html><head><div class="article-info"><span cl...
LogisticRegression+preprocess+hash: 0.515 (+/-0.066)


In [5]:
def get_stream(path, size):
    for chunk in pd.read_csv(path, chunksize=size):
        yield chunk

In [12]:
hashvec = HashingVectorizer(n_features=2**20, 
                            preprocessor=preprocessor, tokenizer=tokenizer_stem_nostop)
# loss='log' gives logistic regression
clf = SGDClassifier(loss='log', max_iter=100, tol=1e-3)
batch_size = 1000
stream = get_stream(path='./dataset/train.csv', size=batch_size)
classes = np.array([-1, 1])
train_auc, val_auc = [], []
# we use one batch for training and another for validation in each iteration
# iters = int((50000+batch_size-1)/(batch_size*2))
iters = 2
for i in range(iters):
    batch = next(stream)
    X_train, y_train = batch['Page content'], batch['Popularity']
    if X_train is None:
        break
    X_train = hashvec.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    train_auc.append(roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))
    
    # validate
    batch = next(stream)
    X_val, y_val = batch['Page content'], batch['Popularity']
    score = roc_auc_score(y_val, clf.predict_proba(hashvec.transform(X_val))[:,1])
    val_auc.append(score)
    print('[{}/{}] {}'.format((i+1)*(batch_size*2), 50000, score))

[2000/50000] 0.49894717458487453
[4000/50000] 0.526117264130901


In [14]:
# import optimized pickle written in C for serializing and 
# de-serializing a Python object
import _pickle as pkl

# dump to disk
pkl.dump(hashvec, open('output/hashvec.pkl', 'wb'))
pkl.dump(clf, open('output/clf-sgd.pkl', 'wb'))

# load from disk
hashvec = pkl.load(open('output/hashvec.pkl', 'rb'))
clf = pkl.load(open('output/clf-sgd.pkl', 'rb'))

df_test = pd.read_csv('./dataset/test.csv')
print('test auc: %.3f' % roc_auc_score(df_test['Popularity'], \
            clf.predict_proba(hashvec.transform(df_test['Page content']))[:,1]))

KeyError: 'Popularity'