In [1]:
from __future__ import division

import base64
import csv
import gzip
import zlib
import nltk
from collections import namedtuple
from bs4 import BeautifulSoup
from bs4.element import Comment
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from tqdm import tqdm
import string
import pickle
import re
import sys

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.linear_model import SGDClassifier
from sklearn import linear_model
from  sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np


In [17]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]'] or isinstance(element, Comment):
        return False
    else:
        return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        links.append(link.get('href'))
    
    return ' '.join(visible_texts), links

def tokenize_me(file_text):
    tokens = nltk.word_tokenize(file_text)
    tokens = [i.lower() for i in tokens if ( i not in string.punctuation )]
 
    stop_words = stopwords.words('russian')
    stop_words.extend(map(lambda x: x.decode('utf8'), 
                          ['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на']))
    tokens = [i for i in tokens if ( i not in stop_words )]     
    return ' '.join(tokens)

In [18]:
DocItem = namedtuple('DocItem', ['doc_id', 'is_spam', 'url', 'html_text'])

def load_csv(input_file_name):    
    with gzip.open(input_file_name) if input_file_name.endswith('gz') else open(input_file_name)  as input_file:
        headers = input_file.readline()
        for i, line in enumerate(input_file):
            parts = line.strip().split('\t')
            url_id = int(parts[0])                                        
            mark = int(parts[1])                    
            url = parts[2]
            pageInb64 = parts[3]
            html_data = base64.b64decode(pageInb64).decode('utf8')            
            yield DocItem(url_id, mark, url, html_data)                   

In [3]:
if load_from_file:
    texts, marks = [], []
    for item in tqdm(load_csv('./antispam-infopoisk/kaggle_train_data_tab.csv.gz')):
        text, links = text_from_html(item.html_text)
        texts.append(tokenize_me(text) + ' '.join(links))
        marks.append(item.is_spam)
    pickle.dump([texts, marks], open('train.p', 'w'))


    test_text, Id = [], []
    for item in tqdm(load_csv('./antispam-infopoisk/kaggle_test_data_tab.csv.gz')):
        text, links = text_from_html(item.html_text)
        test_text.append(tokenize_me(text) + ' ' + ' '.join(links))
        Id.append(item.doc_id)
    pickle.dump((test_text, Id), open('test.p', 'w'))


else:
    texts, marks = pickle.load(open('train.p')) 
    test_text, Id = pickle.load(open('test.p', 'r'))

In [24]:
X_train, X_test, y_train, y_test = train_test_split(texts, marks, test_size=0.1, random_state=42)

## SGD classifier

In [25]:
sgd_text = Pipeline([('tfidf', TfidfVectorizer() ),
                     ('clf', SGDClassifier(verbose=False))])
#sgd_text = clf_text.fit(X_train, y_train)

#y_pred = sgd_text.predict(X_test)
#print f1_score(y_test, y_pred)

In [30]:
min_df_range = np.linspace(0.001, 0.1, 5).round(3)
max_df_range = np.linspace(0.6, 1, 5).round(2)
ngram_range = [(1, 3)]


parameters = {'tfidf__ngram_range': ngram_range,
              'tfidf__min_df' : 0.0,
              'tfidf__max_df' : 1.0,
              'tfidf__use_idf': [True],
             }

In [None]:
gs_clf = GridSearchCV(sgd_text, parameters, n_jobs=-1, verbose=True, scoring='f1', cv=2)

In [None]:
gs_clf.fit(X_train, y_train)

In [97]:
gs_clf.best_params_

{'tfidf__max_df': 1.0,
 'tfidf__min_df': 0.001,
 'tfidf__ngram_range': (1, 2),
 'tfidf__use_idf': True}

In [98]:
y_pred = gs_clf.predict(X_test)
print f1_score(y_test, y_pred)

0.9679124197810494


## Logistic regression

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
lr_text = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LogisticRegression(verbose=False))])


In [32]:
lr_clf = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,3))),
                     ('clf', LogisticRegression(verbose=False))])

In [33]:
lr_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf=Tru...l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=False, warm_start=False))])

In [35]:
y_pred = lr_clf.predict(X_test)

In [36]:
f1_score(y_test, y_pred)

0.96

## SGD

In [12]:
min_df_range = np.linspace(0.001, 0.1, 5).round(3)
max_df_range = np.linspace(0.6, 1, 5).round(2)
ngram_range = [(1, 2)]


parameters = {'tfidf__ngram_range': ngram_range,
              'tfidf__min_df' : min_df_range,
              'tfidf__max_df' : max_df_range,
              'tfidf__use_idf': [True],
             }

In [41]:
lr_clf = Pipeline([('tfidf', TfidfVectorizer(max_df=1.0, min_df=0.001, use_idf=True, ngram_range=(1,3)) ),
                     ('clf', SGDClassifier(verbose=False))])


In [42]:
lr_clf.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.001,
        ngram_range=(1, 3), norm=u'l2', preprocessor=None, smooth_idf...', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=False, warm_start=False))])

In [44]:
y_pred = lr_clf.predict(X_test)

In [45]:
f1_score(y_test, y_pred)

0.9763387297633872

In [None]:
clf = Pipeline([('tfidf', TfidfVectorizer()),
                ('clf', linear_model.RandomizedLogisticRegression())])
clf = clf.fit(texts, marks)
prediction = clf.predict(texts)
print np.mean(prediction == marks)

## Write prediction

In [39]:
def write_predict(model, test_text, Id):
    prediction = model.predict(test_text)
    with open('my_submission.csv' , 'wb') as fout:
        writer = csv.writer(fout)
        writer.writerow(['Id','Prediction'])
        for i, item in enumerate(prediction):
            writer.writerow([Id[i], item])

In [46]:
write_predict(lr_clf, test_text, Id)