In [2]:
import json
import pandas as pd

# Let's load the dataset from Renthop right away
with open('train.json', 'r') as raw_data:
    data = json.load(raw_data)
    df = pd.DataFrame(data)

## Feature extraction

### Bag of Words example

In [3]:
from functools import reduce
import numpy as np

texts = [['i', 'have', 'a', 'cat'], 
        ['he', 'have', 'a', 'dog'], 
        ['he', 'and', 'i', 'have', 'a', 'cat', 'and', 'a', 'dog']]

dictionary = list(enumerate(set(list(reduce(lambda x, y: x + y, texts)))))

def vectorize(text): 
    vector = np.zeros(len(dictionary)) 
    for i, word in dictionary: 
        num = 0 
        for w in text: 
            if w == word: 
                num += 1 
        if num: 
            vector[i] = num 
    return vector

for t in texts: 
    print(vectorize(t))

[ 1.  1.  0.  1.  0.  0.  1.]
[ 0.  1.  1.  1.  0.  1.  0.]
[ 1.  1.  1.  2.  2.  1.  1.]


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(ngram_range=(1,1))

vect.fit_transform(['no i have cows', 'i have no cows']).toarray(), vect.vocabulary_

(array([[1, 1, 1],
        [1, 1, 1]], dtype=int64), {'cows': 0, 'have': 1, 'no': 2})

In [5]:
vect = CountVectorizer(ngram_range=(1,2))

vect.fit_transform(['no i have cows', 'i have no cows']).toarray(), vect.vocabulary_

(array([[1, 1, 1, 0, 1, 0, 1],
        [1, 1, 0, 1, 1, 1, 0]], dtype=int64),
 {'cows': 0,
  'have': 1,
  'have cows': 2,
  'have no': 3,
  'no': 4,
  'no cows': 5,
  'no have': 6})

### Working with dates

In [16]:
df['created'] = pd.to_datetime(df['created'])
df['dow'] = df['created'].apply(lambda x: x.date().weekday())
df['is_weekend'] = df['created'].apply(lambda x: 1 if x.date().weekday() in (5,6) else 0)

In [17]:
cols = ['dow', 'created', 'is_weekend']
df[cols].head()

Unnamed: 0,dow,created,is_weekend
10,4,2016-06-24 07:54:24,0
10000,6,2016-06-12 12:19:27,1
100004,6,2016-04-17 03:26:41,1
100007,0,2016-04-18 02:22:02,0
100013,3,2016-04-28 01:32:41,0


## Feature transformation