In [1]:
from __future__ import division

import json
import base64
import random
from collections import Counter
import os.path
import imp
import gzip

import sys
import csv
import gc

import numpy as np
import pandas as pd


# %matplotlib notebook
%matplotlib inline
import matplotlib.pyplot as plt


import logging
reload(logging)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%H:%M:%S')

In [2]:
def convert2unicode(f):
    """Decorator: make sure ``f`` always receives a ``unicode`` string.

    Values that are not already ``unicode`` are decoded as UTF-8 before being
    passed to ``f``; ``unicode`` values are forwarded untouched.

    :param f: callable taking a single text argument.
    :return: wrapper with the same name/docstring as ``f``.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)  # keep f's __name__/__doc__ visible on the wrapper
    def tmp(text):
        if not isinstance(text, unicode):
            text = text.decode('utf8')
        return f(text)
    return tmp

def convert2lower(f):
    """Decorator: lower-case the text argument before handing it to ``f``.

    :param f: callable taking a single text argument.
    :return: wrapper with the same name/docstring as ``f``.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)  # keep f's __name__/__doc__ visible on the wrapper
    def tmp(text):
        return f(text.lower())
    return tmp

In [3]:
def html2text_bs(raw_html):
    """Extract the textual parts of an HTML page with BeautifulSoup.

    :param raw_html: HTML markup of the page.
    :return: 4-tuple ``(text, titles, links, keywords)`` where
        ``text`` is the page text with ``<script>``/``<style>`` removed,
        ``titles`` is the space-terminated strings of all ``<title>`` tags,
        ``links`` is the space-terminated strings of all ``<a>`` tags, and
        ``keywords`` is the space-terminated ``content`` of every
        ``<meta name="keywords">`` tag.
    """
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(raw_html, "html.parser")

    # Tags whose .string is empty/None contribute nothing.
    titles = u''.join(
        title.string + u' ' for title in soup.find_all('title') if title.string
    )
    links = u''.join(
        link.string + u' ' for link in soup.find_all('a') if link.string
    )
    # .get() returns None for a <meta> without a content attribute, so such
    # tags are skipped just like empty ones.
    keywords = u''.join(
        key['content'] + u' '
        for key in soup.find_all("meta", attrs={"name": "keywords"})
        if key.get('content')
    )

    # Drop script/style subtrees so their source does not end up in the text.
    for node in soup(['script', 'style']):
        node.extract()
    return soup.get_text(), titles, links, keywords

In [4]:
# Decorator order matters: convert2unicode must be the OUTER wrapper so the
# text is decoded first and lower-cased afterwards. Lower-casing a UTF-8 byte
# string before decoding leaves non-ASCII upper-case letters untouched.
@convert2unicode
@convert2lower
def easy_tokenizer(text):
    """Yield the maximal runs of alphanumeric characters found in ``text``.

    Every non-alphanumeric character acts as a separator; empty tokens are
    never produced. The decorators decode and lower-case the input first.
    """
    chars = []  # characters of the token currently being built
    for symbol in text:
        if symbol.isalnum():
            chars.append(symbol)
        elif chars:
            yield u''.join(chars)
            chars = []
    if chars:
        # flush the token that runs up to the end of the text
        yield u''.join(chars)

In [5]:
def html2word(raw_html, to_text=html2text_bs, tokenizer=easy_tokenizer):
    """Turn raw HTML into four token lists.

    ``to_text`` splits the page into (text, titles, links, keywords); each
    part is lower-cased and run through ``tokenizer``.

    :return: tuple ``(text_tokens, title_tokens, link_tokens, keyword_tokens)``.
    """
    text, title, links, keywords = to_text(raw_html)
    parts = (text, title, links, keywords)
    return tuple(list(tokenizer(part.lower())) for part in parts)

In [6]:
def file2docs_csv(input_file_name, reparse=False):
    """Parse a tab-separated page dump into tokenised documents, with a JSON cache.

    The input file (gzip-compressed when its name ends with ``gz``) has one
    header line followed by rows of ``url_id <TAB> mark <TAB> url <TAB>
    base64(html)``. Parsed documents are cached next to the input as
    ``<input_file_name>.json`` and re-used on later calls.

    :param input_file_name: path of the TSV dump.
    :param reparse: when True, ignore an existing cache and parse again.
    :return: list of records ``(url_id, mark, url, words, title, links,
        keywords)``. Freshly parsed records are tuples; records loaded from
        the JSON cache come back as lists.
    :raises: whatever the row parser raises; the offending row is logged first.
    """
    cache_file_name = input_file_name + ".json"

    if os.path.exists(cache_file_name) and not reparse:
        logging.info("File %s.json already exists - load it" % input_file_name)
        with open(cache_file_name, 'rb') as cache_file:
            return json.load(cache_file)

    result = []
    # Pre-initialised so the error report below cannot itself fail when the
    # problem occurs before the first row has been split.
    i, parts = -1, None
    opener = gzip.open if input_file_name.endswith('gz') else open
    with opener(input_file_name) as input_file:
        input_file.readline()  # skip the header row
        try:
            for i, line in enumerate(input_file):
                parts = line.strip().split('\t')
                url_id = int(parts[0])
                mark = bool(int(parts[1]))
                url = parts[2]
                pageInb64 = parts[3]
                html_data = base64.b64decode(pageInb64)
                words, title, links, keywords = html2word(html_data)

                result.append((url_id, mark, url, words, title, links, keywords))
                if i % 100 == 0:
                    logging.info("Complete %04d" % i)
        except Exception:
            # Report where parsing stopped, then let the caller see the error.
            logging.error("Failed on row %s: %r", i, parts)
            raise

    # len(result) is well defined even for a header-only file.
    logging.info("Complete %04d" % len(result))
    logging.info("Create json dump %s" % cache_file_name)
    with open(cache_file_name, 'w') as cache_file:
        json.dump(result, cache_file)
    return result

In [7]:
import sys
import zlib

def calc_stats(words, title, links):
    """Compute simple size statistics for one document.

    :param words: list of body tokens.
    :param title: list of title tokens.
    :param links: list of link-text tokens.
    :return: ``[n_words, mean_word_length, n_title_tokens, n_link_tokens,
        compress_coeff]``.
    """
    words_str = ''.join(words).encode('utf-8')
    # NOTE(review): sys.getsizeof measures the in-memory size of the string
    # objects (header included), not the payload length, so this is only a
    # proxy for the zlib compression ratio. Kept as-is so the feature values
    # stay comparable with earlier runs.
    compress_coeff = sys.getsizeof(zlib.compress(words_str)) * 1.0 / sys.getsizeof(words_str)

    # An explicit list (rather than a lazy map) plus a guard for empty
    # documents: the mean of an empty sequence is NaN, which would poison the
    # feature matrix.
    mean_word_len = np.mean([len(w) for w in words]) if len(words) else 0.0

    return [len(words), mean_word_len, len(title), len(links), compress_coeff]

In [8]:
def get_n_grams(url, window=5):
    """Return the set of distinct character n-grams of length ``window``.

    :param url: any sliceable string (despite the name it is also used on
        plain text).
    :param window: n-gram length.
    :return: set of substrings; empty when the input is shorter than ``window``.
    """
    # +1 so the n-gram ending at the last character is included (the upper
    # bound of range() is exclusive).
    return set(url[i:i + window] for i in range(len(url) - window + 1))

In [9]:
from urllib import unquote
from urlparse import urlparse
import re

def parse_url(url):
    """Split a URL into host and path tokens.

    The host (netloc) is split on dots, the path on any of ``- _ / .``;
    empty tokens are dropped. Host tokens come first.
    """
    pieces = urlparse(url)
    tokens = pieces.netloc.split('.') + re.split(r'[-_/.]', pieces.path)
    return [token for token in tokens if token]

url = u'http://whistkeruso.narod.ru/prostitutki-shlyuhi-deshevie-moskvi.html'
parse_url(url)

[u'whistkeruso',
 u'narod',
 u'ru',
 u'prostitutki',
 u'shlyuhi',
 u'deshevie',
 u'moskvi',
 u'html']

## Experiments

In [10]:
TRAIN_DATA_FILE  = 'kaggle/kaggle_train_data_tab.csv.gz'
train_docs = file2docs_csv(TRAIN_DATA_FILE, reparse=False)

11:34:37 INFO:File kaggle/kaggle_train_data_tab.csv.gz.json already exists - load it


In [11]:
len(train_docs)

7044

In [12]:
random.shuffle(train_docs)

In [13]:
TEST_DATA_FILE  = 'kaggle/kaggle_test_data_tab.csv.gz'
test_docs = file2docs_csv(TEST_DATA_FILE, reparse=False)

11:34:49 INFO:File kaggle/kaggle_test_data_tab.csv.gz.json already exists - load it


In [14]:
len(test_docs)

16039

In [15]:
random.shuffle(test_docs)

In [16]:
titles      = {'train': [], 'test': []}
urls_parsed = {'train': [], 'test': []}
urls_raw    = {'train': [], 'test': []}
keywords    = {'train': [], 'test': []}

### FEATURE: stats

In [17]:
# Build the base (statistics) feature matrix and the label list for the
# training documents, and collect per-document strings for later vectorizers.
# NOTE: this cell appends to the module-level dicts created earlier, so
# running it twice duplicates their 'train' entries.
X_train, Y_train = [], []

for doc in train_docs:
    doc_id, is_spam, url, words_, title, links, keywords_ = doc
    
    # Keep only the first 5 characters of every title/keyword token
    # (presumably a crude stemming step).
    title = [w[:5] for w in title]
    keywords_ = [w[:5] for w in keywords_]
    
    X_train.append(calc_stats(words_, title, links))
    
    titles['train'].append(' '.join(title))
    urls_parsed['train'].append(' '.join(parse_url(url)))
    urls_raw['train'].append(url)
    keywords['train'].append(' '.join(keywords_))
    
    Y_train.append(is_spam)
    
X_train = np.asarray(X_train)

In [18]:
# Same processing as for the training documents, applied to the test set.
# doc_ids remembers the document order so predictions can be written out
# against the right ids later.
# NOTE: this cell appends to the module-level dicts created earlier, so
# running it twice duplicates their 'test' entries.
doc_ids = []

X_test, Y_test = [], []

for doc in test_docs:
    doc_id, is_spam, url, words_, title, links, keywords_ = doc
    
    # Keep only the first 5 characters of every title/keyword token.
    title = [w[:5] for w in title]
    keywords_ = [w[:5] for w in keywords_]
    
    X_test.append(calc_stats(words_, title, links))
    
    titles['test'].append(' '.join(title))
    urls_parsed['test'].append(' '.join(parse_url(url)))
    urls_raw['test'].append(url)
    keywords['test'].append(' '.join(keywords_))
    
    doc_ids.append(doc_id)
    
    # Labels of the test set are not collected, so Y_test stays empty.
    # Y_test.append(is_spam)
    
X_test = np.asarray(X_test)

### FEATURE: title

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# Tf-idf over the (prefix-truncated) title tokens; the vocabulary is learned
# on the training titles only and then applied to the test titles.
vec = TfidfVectorizer()
# vec = CountVectorizer()

feature_train = vec.fit_transform(titles['train'])
feature_test  = vec.transform(titles['test'])

# Keep only the columns whose summed weight over the training set exceeds
# 10.0. The column sums form a single row, so [1] takes the column indexes.
feature_indexes = np.where(feature_train.sum(axis=0) > 10.0)[1]

feature_train = feature_train[:, feature_indexes]
feature_test  = feature_test[:, feature_indexes]

In [20]:
X_train = np.hstack((X_train, feature_train.toarray()))
X_test  = np.hstack((X_test, feature_test.toarray()))

### FEATURE: most frequent words

In [21]:
# One space-joined string of body tokens per document: all training
# documents first, then all test documents. Later cells rely on this order
# when they slice the matrix at len(train_docs).
words_all = []

for doc in train_docs:
    doc_id, is_spam, url, words_, title, links, keywords_ = doc
    words_all.append(' '.join(words_))

for doc in test_docs:
    doc_id, is_spam, url, words_, title, links, keywords_ = doc
    words_all.append(' '.join(words_))

In [22]:
vec = CountVectorizer()

words_all = vec.fit_transform(words_all)
words_all.shape

(23083, 356061)

In [23]:
# Number of most frequent words to keep. Stored as a float; it is cast to
# int for slicing here and used as a divisor in a later cell.
n = 700.0

# Column indexes of the n words with the largest total count over the whole
# corpus (train + test), sorted back into ascending column order.
words_most_frequent = np.asarray(words_all.sum(axis=0)).reshape(-1)
words_most_frequent = words_most_frequent.argsort()[-int(n):]
words_most_frequent.sort()

# Feature: per-document total number of occurrences of those words.
# Rows [0, len(train_docs)) are training documents, the rest are test ones.
feature_train = words_all[:len(train_docs), words_most_frequent]
feature_train = feature_train.sum(axis=1)
feature_train = np.asarray(feature_train)

feature_test = words_all[len(train_docs):, words_most_frequent]
feature_test = feature_test.sum(axis=1)
feature_test = np.asarray(feature_test)

In [24]:
X_train.shape, feature_train.shape

((7044, 230), (7044, 1))

In [25]:
X_train = np.hstack((X_train, feature_train))
X_test  = np.hstack((X_test, feature_test))

In [26]:
# Feature: share of the n most frequent words that occur in the document.
# Dividing the count matrix by itself turns every non-zero count into 1 and
# every zero into NaN (0/0); nan_to_num then maps NaN to 0, which leaves a
# presence indicator that is summed per row and divided by n.
# NOTE(review): relies on element-wise sparse/sparse division returning NaN
# for zero entries -- confirm against the installed scipy version.
feature_train = words_all[:len(train_docs), words_most_frequent]
feature_train /= feature_train
feature_train = np.nan_to_num(feature_train)
feature_train = feature_train.sum(axis=1) / n
feature_train = np.asarray(feature_train)

feature_test = words_all[len(train_docs):, words_most_frequent]
feature_test /= feature_test
feature_test = np.nan_to_num(feature_test)
feature_test = feature_test.sum(axis=1) / n
feature_test = np.asarray(feature_test)

In [27]:
X_train = np.hstack((X_train, feature_train))
X_test  = np.hstack((X_test, feature_test))

In [28]:
# Tf-idf weighting of the counts of the most frequent words. The transformer
# is fitted on every row of words_all (train and test documents together)
# and then applied to the train and test slices separately.
vec = TfidfTransformer()

vec.fit(words_all[:, words_most_frequent])

feature_train = vec.transform(words_all[:len(train_docs), words_most_frequent])
feature_test  = vec.transform(words_all[len(train_docs):, words_most_frequent])

In [29]:
X_train = np.hstack((X_train, feature_train.toarray()))
X_test  = np.hstack((X_test, feature_test.toarray()))

In [30]:
del words_all

### FEATURE: ngrams independence

In [31]:
def measure_ngram_independence(ngrams):
    """Mean negative log relative frequency over the distinct n-grams.

    Each distinct item's count is divided by ``len(ngrams)`` and the function
    returns ``-mean(log(p))`` over the distinct items.

    :param ngrams: sized collection of hashable n-grams.
    :return: the measure as a float; ``0.0`` for an empty collection
        (instead of NaN from a 0/0 division).
    """
    total = len(ngrams)
    if total == 0:
        return 0.0
    # list(...) so numpy receives a real sequence regardless of what
    # Counter.values() returns.
    counts = np.asarray(list(Counter(ngrams).values()), dtype=float)
    probs = counts / total
    return -np.mean(np.log(probs))

In [32]:
# Feature: n-gram measure over the 7-character n-grams of each document's
# space-joined body tokens, one value per document (column vector).
# NOTE(review): get_n_grams returns a set, so every n-gram reaches
# measure_ngram_independence exactly once -- confirm this is intended.
feature_train = []
    
for doc in train_docs:
    doc_id, is_spam, url, words_, title, links, keywords_ = doc
    ngr = get_n_grams(' '.join(words_), window=7)
    feature_train.append(measure_ngram_independence(ngr))
    
feature_train = np.asarray(feature_train).reshape(-1, 1)

feature_test  = []

for doc in test_docs:
    doc_id, is_spam, url, words_, title, links, keywords_ = doc
    ngr = get_n_grams(' '.join(words_), window=7)
    feature_test.append(measure_ngram_independence(ngr))

feature_test = np.asarray(feature_test).reshape(-1, 1)

In [33]:
X_train = np.hstack((X_train, feature_train))
X_test  = np.hstack((X_test, feature_test))

### FEATURE: words in urls

In [34]:
# Bag-of-words over the URL tokens produced by parse_url; the vocabulary is
# learned on the training URLs only.
vec = CountVectorizer()

feature_train = vec.fit_transform(urls_parsed['train'])
feature_test  = vec.transform(urls_parsed['test'])

# Keep only tokens whose total count over the training URLs exceeds 10.
feature_indexes = np.where(feature_train.sum(axis=0) > 10.0)[1]

feature_train = feature_train[:, feature_indexes]
feature_test  = feature_test[:, feature_indexes]

In [35]:
X_train = np.hstack((X_train, feature_train.toarray()))
X_test  = np.hstack((X_test, feature_test.toarray()))

### FEATURE: urls ratio

In [36]:
def url_ratio(url):
    """Mean length of the URL's path tokens relative to the whole URL length.

    The path is split on ``- _ / .``; empty tokens are ignored. Returns 0
    when the path holds no tokens.
    """
    path = urlparse(url).path
    token_lengths = [len(token) for token in re.split(r'[-_/.]', path) if token != '']
    if not token_lengths:
        return 0
    return np.mean(np.asarray(token_lengths, dtype=float) / len(url))

In [37]:
feature_train = np.asarray(map(url_ratio, urls_raw['train'])).reshape(-1, 1)
feature_test  = np.asarray(map(url_ratio, urls_raw['test'])).reshape(-1, 1)

In [38]:
X_train = np.hstack((X_train, feature_train))
X_test  = np.hstack((X_test, feature_test))

In [39]:
def url_ratio(url):
    """Share of the URL's characters taken up by the host part (netloc).

    :param url: URL string.
    :return: ``len(netloc) / len(url)`` as a float; ``0.0`` for an empty URL
        (instead of a ZeroDivisionError).
    """
    if not url:
        return 0.0
    return len(urlparse(url).netloc) / float(len(url))

In [40]:
feature_train = np.asarray(map(url_ratio, urls_raw['train'])).reshape(-1, 1)
feature_test  = np.asarray(map(url_ratio, urls_raw['test'])).reshape(-1, 1)

In [41]:
X_train = np.hstack((X_train, feature_train))
X_test  = np.hstack((X_test, feature_test))

In [42]:
def url_ratio(url):
    """Number of tokens in the last path segment of ``url``.

    The last ``/``-separated segment of the path is split on ``-``, ``_`` and
    ``.``; the number of resulting pieces is returned (1 when the segment
    contains no separator, including an empty segment).
    """
    last_segment = urlparse(url).path.split('/')[-1]
    # '-' comes first in the class so it is a literal dash. Written as
    # r'[.-_]' it is a character RANGE from '.' to '_' that also matches
    # digits, upper-case letters, '/', ':' and more.
    return len(re.split(r'[-_.]', last_segment))

In [43]:
feature_train = np.asarray(map(url_ratio, urls_raw['train'])).reshape(-1, 1)
feature_test  = np.asarray(map(url_ratio, urls_raw['test'])).reshape(-1, 1)

In [44]:
X_train = np.hstack((X_train, feature_train))
X_test  = np.hstack((X_test, feature_test))

### FEATURE: url

In [45]:
def parse_url_f(url):
    """Split a URL into its non-empty path segments and query parameters.

    :return: tuple ``(segments, q_params)``: the path split on ``/`` and the
        query string split on ``&``, each with empty pieces removed.
    """
    parsed = urlparse(url)
    path_pieces = parsed.path.split('/')
    query_pieces = parsed.query.split('&')
    return [s for s in path_pieces if s], [q for q in query_pieces if q]


def extract_features_from_url(url):
    """Describe the structure of a URL as a list of string features.

    Query parameters produce ``param_name:<name>`` and ``param:<name=value>``
    entries. If the path has no segments only those are returned. Otherwise
    ``segments:<count>`` is added, followed, for every path segment at
    position ``i``, by features for its name, length, extension, digit
    patterns and the number of dashes / underscores / spaces it contains.

    :param url: URL string.
    :return: list of feature strings (query features first, then per-segment
        features in path order).
    """
    segments, q_params = parse_url_f(url)
    features = []

    if q_params:
        # Name-only feature first, then the full "name=value" feature.
        features += ['param_name:{}'.format(param.split('=')[0]) for param in q_params]
        features += ['param:{}'.format(param) for param in q_params]

    if len(segments) == 0:
        return features

    features.append('segments:{}'.format(len(segments)))

    # The first placeholder is always the segment position; the second (when
    # present) is the value. Templates are addressed by index below.
    categories_templates = [
        'segment_name_{}:{}',             # name of seg
        'segment_[0-9]_{}:1',             # seg consists of digits
        'segment_substr[0-9]_{}:1',       # seg has pattern <str><digits><str>
        'segment_ext_{}:{}',              # seg has an extension
        'segment_ext_substr[0-9]_{}:{}',  # seg has pattern and an extension
        'segment_len_{}:{}',              # length of seg

        'wiki_lines_{}:{}',
        'wiki_underlines_{}:{}',
        'wiki_spaces_{}:{}',
        'wiki_all_spaces_{}:{}',
    ]

    for i, seg in enumerate(segments):
        features.append(categories_templates[0].format(i, seg))

        seg_name, seg_ext = os.path.splitext(seg)
        seg_ext = seg_ext[1:]            # remove dot in extension

        # Segment name (without extension) made of digits only.
        if re.search(r'^(\d)+$', seg_name):
            features.append(categories_templates[1].format(i))

        # pattern_1 = re.search(r'^([^\d]*)(\d+)([^\d]+)$', seg_name)
        # pattern_2 = re.search(r'^([^\d]+)(\d+)([^\d]*)$', seg_name)
        # Name ends with <non-digits><digits><non-digits> (not anchored at
        # the start of the name).
        pattern = re.search(r'[^\d]+\d+[^\d]+$', seg_name)

        if pattern:
            features.append(categories_templates[2].format(i))
        if seg_ext:
            features.append(categories_templates[3].format(i, seg_ext))
        if pattern and seg_ext:
            features.append(categories_templates[4].format(i, seg_ext))
        # Length of the full segment, extension included.
        features.append(categories_templates[5].format(i, len(seg)))

        counts_lines = seg_name.count('-')
        if counts_lines:
            features.append(categories_templates[6].format(i, counts_lines))
        
        counts_underlines = seg_name.count('_')
        if counts_underlines:
            features.append(categories_templates[7].format(i, counts_underlines))

        counts_spaces = seg_name.count(' ')
        if counts_spaces:
            features.append(categories_templates[8].format(i, counts_spaces))

        # NOTE(review): the condition ignores dashes while the emitted value
        # includes them -- confirm whether counts_lines belongs in the test.
        if counts_underlines + counts_spaces:
            features.append(categories_templates[9].format(i, counts_lines + counts_underlines + counts_spaces))

    return features

def choose_features(features, threshold):
    """Keep the features whose count is strictly greater than ``threshold``.

    :param features: either a mapping ``feature -> count`` (e.g. a
        ``collections.Counter``) or an iterable of ``(feature, count)`` pairs.
    :param threshold: minimal count (exclusive).
    :return: for a mapping, the list of selected feature keys; for an
        iterable of pairs, the list of selected pairs.
    """
    if hasattr(features, 'items'):
        # Iterating a mapping yields only its keys, so f[1] would be the
        # second character of the feature name rather than its count.
        return [name for name, count in features.items() if count > threshold]
    return [f for f in features if f[1] > threshold]


In [46]:
# Structural URL features: one list of feature strings per URL.
segments_train = map(extract_features_from_url, urls_raw['train'])
segments_test  = map(extract_features_from_url, urls_raw['test'])

# Candidate features are counted over the first 1000 training and first 1000
# test URLs; the threshold passed to choose_features is alpha * 2000, i.e.
# alpha times the number of URLs sampled.
# NOTE: slicing the map() results requires them to be lists (Python 2).
alpha = .05
segments = []
[segments.extend(s) for s in segments_train[:1000]]
[segments.extend(s) for s in segments_test[:1000]]

segments = choose_features(Counter(segments), alpha * 2000)

# Binary indicator matrix: cell [doc, j] is 1 when selected feature j occurs
# among that document's URL features.
feature_train = np.asarray([[s in seg for s in segments] for seg in segments_train], dtype=int)
feature_test  = np.asarray([[s in seg for s in segments] for seg in segments_test], dtype=int)

In [47]:
del segments_train
del segments_test
del segments

In [48]:
X_train = np.hstack((X_train, feature_train))
X_test  = np.hstack((X_test, feature_test))

### FEATURE: keywords

In [49]:
# Bag-of-words over the (prefix-truncated) <meta name="keywords"> tokens;
# the vocabulary is learned on the training documents only.
vec = CountVectorizer()

feature_train = vec.fit_transform(keywords['train'])
feature_test  = vec.transform(keywords['test'])

# Keep only tokens whose total count over the training set exceeds 10.
feature_indexes = np.where(feature_train.sum(axis=0) > 10.0)[1]

feature_train = feature_train[:, feature_indexes]
feature_test  = feature_test[:, feature_indexes]

In [50]:
X_train = np.hstack((X_train, feature_train.toarray()))
X_test  = np.hstack((X_test, feature_test.toarray()))

### PREDICTION

In [51]:
from sklearn.ensemble import GradientBoostingClassifier as GBC

# Gradient boosting on the full stacked feature matrix.
# NOTE(review): no random_state is passed, so repeated runs may not produce
# identical models -- set one if reproducibility matters.
cl = GBC(learning_rate=0.3, max_depth=7)
cl.fit(X_train, Y_train)

GradientBoostingClassifier(init=None, learning_rate=0.3, loss='deviance',
              max_depth=7, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [52]:
Y_pred = cl.predict(X_test)

In [53]:
# Write the submission file: one "Id,Prediction" row per test document.
# doc_ids was filled in the same order as X_test, so position i of Y_pred
# belongs to doc_ids[i]. Binary mode ('wb') is what the Python 2 csv module
# expects.
with open('my_submission.csv', 'wb') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Id','Prediction'])
    # Y_pred comes from cl.predict, so `prob` holds a predicted class label
    # rather than a probability.
    for i, prob in enumerate(Y_pred):
        writer.writerow([doc_ids[i], int(prob)])