In [26]:
import pandas as pd
from scipy.sparse import hstack, save_npz, csr_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the training split; the file is parsed as tab-separated text.
train_df = pd.read_csv('data/train.txt', sep='\t')
# Show (n_rows, n_columns) as a quick check that the load worked.
train_df.shape

(10672, 8)

In [5]:
# Preview the first three rows to inspect the available columns.
train_df.head(3)

Unnamed: 0,id,title,url,publisher,category,story,hostname,timestamp
0,289974,Draghi Unites Euro Bulls With Bears Watching $...,http://www.businessweek.com/news/2014-06-16/dr...,Businessweek,b,dWFE6j8qiln4-gM9XHXLwrtbu9G3M,www.businessweek.com,1402916284786
1,170077,"A Guide To Spring Gardening, For Allergy-Suffe...",http://www.huffingtonpost.com/2014/04/26/garde...,Huffington Post,m,d9rqLdlVaeG5QeMA7Qa82YUH0FgkM,www.huffingtonpost.com,1398799729255
2,381569,By Odin's beard: Marvel creates a storm of con...,http://www.dailymail.co.uk/news/article-269326...,Daily Mail,e,dUmj0nbZNQbZRjMarG4k-G60NPjyM,www.dailymail.co.uk,1405500950599


In [13]:
# One-hot encoder for the `hostname` column, fitted on the distinct training
# hostnames (reshaped to a single-feature 2-D array as the encoder expects).
# handle_unknown='ignore' lets hostnames not seen during fit be transformed
# later without raising.
hostname_encoder = (
    OneHotEncoder(handle_unknown='ignore')
    .fit(train_df['hostname'].unique().reshape(-1, 1))
)

In [18]:
# Sanity check: encode the training hostnames and display the resulting
# sparse matrix summary (one row per training example).
hostname_encoder.transform(train_df.hostname.values.reshape(-1, 1))

<10672x6 sparse matrix of type '<class 'numpy.float64'>'
	with 10672 stored elements in Compressed Sparse Row format>

In [19]:
# One-hot encoder for the `publisher` column, fitted on the distinct training
# publishers (reshaped to a single-feature 2-D array as the encoder expects).
# handle_unknown='ignore' lets publishers not seen during fit be transformed
# later without raising.
publisher_encoder = (
    OneHotEncoder(handle_unknown='ignore')
    .fit(train_df['publisher'].unique().reshape(-1, 1))
)

In [20]:
# Sanity check: encode the training publishers and display the resulting
# sparse matrix summary (one row per training example).
publisher_encoder.transform(train_df.publisher.values.reshape(-1, 1))

<10672x5 sparse matrix of type '<class 'numpy.float64'>'
	with 10672 stored elements in Compressed Sparse Row format>

In [21]:
# TF-IDF vectorizer for headlines; its vocabulary and IDF weights are learned
# from the training titles only (default TfidfVectorizer settings).
title_vectorizer = (
    TfidfVectorizer()
    .fit(train_df['title'])
)

In [22]:
# Sanity check: vectorize the training titles and display the resulting
# sparse matrix summary (rows = training examples, columns = vocabulary terms).
title_vectorizer.transform(train_df.title)

<10672x12858 sparse matrix of type '<class 'numpy.float64'>'
	with 106104 stored elements in Compressed Sparse Row format>

In [31]:
def create_vectors(df: pd.DataFrame) -> csr_matrix:
    """Build the sparse feature matrix for one data split.

    Uses the notebook-level fitted transformers `title_vectorizer`,
    `publisher_encoder` and `hostname_encoder`, so those cells must have
    been run first.

    Args:
        df: Frame providing `title`, `publisher` and `hostname` columns.

    Returns:
        A CSR matrix with one row per row of `df`; columns are the title
        TF-IDF features, then the publisher one-hot features, then the
        hostname one-hot features, concatenated in that order.
    """
    title_features = title_vectorizer.transform(df.title)
    # The encoders take a 2-D (n_samples, 1) array, hence the reshape.
    publisher_features = publisher_encoder.transform(df.publisher.values.reshape(-1, 1))
    hostname_features = hostname_encoder.transform(df.hostname.values.reshape(-1, 1))

    stacked = hstack([title_features, publisher_features, hostname_features])
    # Convert the stacked result to CSR to match the declared return type.
    return stacked.tocsr()

In [34]:
# Load the validation and test splits (tab-separated, read in that order).
valid_df, test_df = (
    pd.read_csv(split_path, sep='\t')
    for split_path in ('data/valid.txt', 'data/test.txt')
)

In [35]:
# Vectorize each split with the train-fitted transformers and persist it.
# NOTE: scipy's save_npz appends '.npz' to any string path that does not
# already end with it, so a 'data/<split>.feature.txt' argument would land on
# disk as 'data/<split>.feature.txt.npz' anyway. The full names are spelled out
# here so the code matches the files that load_npz must later be pointed at.
for split_df, feature_path in (
    (train_df, 'data/train.feature.txt.npz'),
    (valid_df, 'data/valid.feature.txt.npz'),
    (test_df, 'data/test.feature.txt.npz'),
):
    save_npz(feature_path, create_vectors(split_df))
print('DONE')

DONE
