In [85]:
import pandas
import datetime
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import KFold
from sklearn.metrics import *
from sklearn.grid_search import GridSearchCV
from collections import OrderedDict

In [8]:
# (unit prefix, days per unit) pairs understood by parse_age; months and
# years use the 30 / 365 day approximations.
_AGE_UNIT_DAYS = (
    ('day', 1),
    ('week', 7),
    ('month', 30),
    ('year', 365),
)


def parse_age(x):
    """Convert an age string such as ``"2 weeks"`` into a whole number of days.

    Args:
        x: text of the form ``"<integer> <unit>"`` where the unit starts with
            ``day``, ``week``, ``month`` or ``year`` (so plural forms match).

    Returns:
        int: the age in days (weeks * 7, months * 30, years * 365).

    Raises:
        ValueError: if ``x`` does not split into exactly two parts, the count
            is not an integer, or the unit is not recognised.
    """
    time, resolution = x.split()
    time = int(time)
    for prefix, days_per_unit in _AGE_UNIT_DAYS:
        if resolution.startswith(prefix):
            return time * days_per_unit
    # Fail loudly instead of printing and returning None, which only surfaced
    # later as an unrelated TypeError in the caller's arithmetic.
    raise ValueError('unrecognised age unit in {!r}'.format(x))

In [59]:
class FeatureEncoder(object):
    """Build a numeric feature table from raw animal records.

    ``fit`` inspects a training frame and records, in ``feature_dict``, one
    callable per output column (column name -> ``callable(frame) -> list``).
    ``transform`` evaluates those callables on any frame with the same raw
    columns: ``AnimalType``, ``DateTime``, ``AgeuponOutcome``,
    ``SexuponOutcome``, ``Breed`` and ``Color``.

    Everything learned during ``fit`` (category lists, the AnimalType code
    table) is stored on the instance, so the encoder does not depend on any
    notebook-level encoder objects and encodes later frames with exactly the
    codes learned from the training frame.
    """

    # Timestamp layout expected in the ``DateTime`` column.
    DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
    # A Breed / Color value needs more than this many training rows to get
    # its own indicator column.
    MIN_CATEGORY_COUNT = 10

    def __init__(self):
        super(FeatureEncoder, self).__init__()
        self.feature_dict = OrderedDict()

    @classmethod
    def _date_feature(cls, from_date):
        """Return a feature that applies ``from_date`` to each parsed DateTime."""
        def feature(x):
            return [from_date(datetime.datetime.strptime(xi, cls.DATE_FORMAT).date())
                    for xi in x.DateTime]
        return feature

    @staticmethod
    def _one_hot(column, value):
        """Return a 0/1 indicator feature for ``frame[column] == value``."""
        def feature(x):
            return [int(j == value) for j in x[column]]
        return feature

    @staticmethod
    def _coded(column, codes):
        """Return a feature mapping ``frame[column]`` through the ``codes`` dict."""
        def feature(x):
            unseen = set(x[column]) - set(codes)
            if unseen:
                raise ValueError('{} contains values not seen during fit: {}'.format(
                    column, sorted(unseen, key=str)))
            return [codes[j] for j in x[column]]
        return feature

    def fit(self, raw_data):
        """Learn the output columns from ``raw_data``.

        Replaces any previously learned ``feature_dict``, so calling ``fit``
        again does not leave columns from an earlier frame behind.

        Args:
            raw_data: training DataFrame holding the raw columns listed in the
                class docstring.
        """
        features = OrderedDict()

        # Integer code per animal type, assigned in sorted order of the values
        # present in the training frame.
        animal_types = sorted(raw_data.AnimalType.unique())
        features['AnimalType'] = self._coded(
            'AnimalType', {name: code for code, name in enumerate(animal_types)})

        # Calendar features, each divided by a fixed constant.
        features['month'] = self._date_feature(lambda d: d.month / 12)
        features['season'] = self._date_feature(lambda d: d.month // 4 / 12)
        features['10_day'] = self._date_feature(lambda d: d.day // 10 / 3)
        features['week'] = self._date_feature(lambda d: d.day // 7 / 5)

        # Age in whole weeks divided by 54; parse_age yields days.
        features['Age_weeks'] = lambda x: [parse_age(i) // 7 / 54 for i in x.AgeuponOutcome]

        # One indicator column per category value, in sorted order.  Age and
        # sex keep every value; breed and colour keep only frequent ones.
        thresholds = (
            ('AgeuponOutcome', 0),
            ('SexuponOutcome', 0),
            ('Breed', self.MIN_CATEGORY_COUNT),
            ('Color', self.MIN_CATEGORY_COUNT),
        )
        for column, min_count in thresholds:
            counts = raw_data[column].value_counts()
            for value in sorted(counts.index):
                if counts.loc[value] > min_count:
                    features[column + '__' + value] = self._one_hot(column, value)

        # find() > 0: the marker must appear somewhere after the first character.
        features['is_breed_mix'] = lambda x: [int(i.find('Mix') > 0) for i in x.Breed]
        features['is_color_mix'] = lambda x: [int(i.find('/') > 0) for i in x.Color]

        self.feature_dict = features

    def transform(self, raw_data):
        """Evaluate every learned feature on ``raw_data``.

        Args:
            raw_data: DataFrame with the same raw columns used in ``fit``.

        Returns:
            pandas.DataFrame: one column per entry of ``feature_dict``, in the
            order they were registered; empty if ``fit`` has not been called.

        Raises:
            ValueError: if ``raw_data.AnimalType`` holds a value that was not
                present in the frame given to ``fit``.
        """
        # Compute all columns first and build the frame once, rather than
        # inserting hundreds of columns one at a time.
        columns = OrderedDict(
            (key, func(raw_data)) for key, func in self.feature_dict.items())
        return pandas.DataFrame(columns, columns=list(columns))

    def fit_transform(self, raw_data):
        """Fit on ``raw_data`` and return its encoded feature table."""
        self.fit(raw_data)
        return self.transform(raw_data)

In [60]:
# Notebook-level scikit-learn encoder instances.  `le` is used further down to
# turn the OutcomeType strings into integer labels.
# NOTE(review): these are plain globals, so any code that refers to them by
# name only works if this cell has been run first.
mlb = MultiLabelBinarizer()
le = LabelEncoder()

In [61]:
data = pandas.read_csv('train.csv')
# Fill missing sex / age entries with the most frequent value of the column.
# Assign the result back: calling fillna(inplace=True) on `data[column]` acts
# on an intermediate Series and is not guaranteed to update `data` itself.
for column in ('SexuponOutcome', 'AgeuponOutcome'):
    data[column] = data[column].fillna(data[column].describe().top)

In [62]:
# Unfitted encoder; it is fitted on the training frame in the next cell.
feature_encoder = FeatureEncoder()

In [63]:
# Learn the feature columns from the training rows and encode them in one step.
train_data = feature_encoder.fit_transform(data)

In [64]:
# Encode the OutcomeType strings as integer class ids.  The column is appended
# last and named 'label'; gen_output reads the last column as the row's label
# and skips the column called 'label' when writing features.
train_data['label'] = le.fit_transform(data['OutcomeType']).astype(int)

In [65]:
# Sanity check: print the last column (the label) of the first row only.
# .iloc makes the positional lookup explicit; a bare integer key on a Series
# indexed by column names relies on a deprecated positional fallback.
for i in train_data.iterrows():
    print(i[1].iloc[-1])
    break

3.0


In [86]:
def _ffm_field_key(column):
    """Return the field a column belongs to: the text before '__', else the whole name."""
    if column.find('__') > 0:
        return column.split('__')[0]
    return column


def gen_output(raw_data, file_name, target):
    """Write ``raw_data`` to ``file_name`` as ``label field:feature:value ...`` lines.

    Columns sharing the prefix before ``'__'`` belong to the same field; fields
    are numbered from 1 in order of first appearance.  The feature number is
    the column's 1-based position in ``raw_data``.  A column named ``'label'``
    is never written as a feature.  (The notebook's output file names suggest
    the files are meant for an FFM trainer.)

    Args:
        raw_data: DataFrame of numeric features; its last column is compared
            with ``target`` to derive the binary label.
        file_name: path of the text file to create or overwrite.
        target: value of the last column that counts as the positive class.
            With a value that never occurs (e.g. ``-1``) every row gets
            label ``0``.

    The label is always written as the first token of each line.  Previously
    it was emitted only when ``target`` was negative, which left the per-class
    training files without any label.
    """
    col = raw_data.columns

    # Field number per column prefix, in order of first appearance.
    fields = {}
    for name in col:
        key = _ffm_field_key(name)
        if key not in fields:
            fields[key] = len(fields) + 1

    # The context manager guarantees the file is flushed and closed.
    with open(file_name, 'w') as out:
        for _, row in raw_data.iterrows():
            # Positional access via the underlying array; integer keys on a
            # Series indexed by column names are not reliably positional.
            values = row.values
            label = 1 if values[-1] == target else 0
            sample = [str(label)]
            for p, name in enumerate(col):
                if name == 'label':
                    continue
                sample.append('{}:{}:{}'.format(fields[_ffm_field_key(name)], p + 1, values[p]))
            out.write(' '.join(sample) + '\n')

In [87]:
test_set = pandas.read_csv('test.csv')
# Fill gaps with the most frequent value from the *training* frame so both
# sets are imputed identically.  Assign the result back: fillna(inplace=True)
# on `test_set[column]` acts on an intermediate Series and is not guaranteed
# to update `test_set` itself.
for column in ('SexuponOutcome', 'AgeuponOutcome'):
    test_set[column] = test_set[column].fillna(data[column].describe().top)
test_data = feature_encoder.transform(test_set)

In [None]:
# One output file per target id 0-4; for each file gen_output derives the
# binary label of a row as (last column == i).
# NOTE(review): 5 is presumably the number of distinct OutcomeType classes -
# confirm against the encoded labels.
for i in range(5):
    gen_output(train_data, '{}_train_ffm'.format(i), i)

In [88]:
# The test frame has no 'label' column; a target of -1 matches no value in its
# last column, so every row is given the label 0.
gen_output(test_data, 'test_ffm', -1)