Commit 54cdcca: Working model

zacstewart committed Oct 17, 2013 (initial commit, 0 parents)
Showing 4 changed files with 149,815 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
data/*
venv
30 changes: 30 additions & 0 deletions CV performance.txt
@@ -0,0 +1,30 @@
tags, locations
num_votes: 0.20826522914
num_views: 0.0463900779609
num_comments: -0.00915516610715

tags, locations, desc length
num_votes: 0.213893936991
num_views: 0.0509179454511
num_comments: -0.00565685033158

tags, locations, desc length, 1-gram counts
num_votes: 0.194265113554
num_views: 0.0678427693326
num_comments: 0.0246830406565

tags, locations, desc length, 1-gram tfidf
num_votes: 0.194265113554
num_views: 0.0678427693326
num_comments: 0.0246830406565

tags, locations, desc length, 1,2,3-gram tfidf, month

tags, source, locations, desc length, 1,2,3-gram tfidf
num_votes: 0.264082779812
num_views: 0.0957141953819
num_comments: 0.033120785281

Ideas

* train/predict values per month and then extrapolate (see the sketch below)
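
A minimal sketch of that per-month idea, under stated assumptions: none of this code is part of the commit, the helper names (month_key, fit_per_month, monthly_trend) are hypothetical, and a real version would plug in the featurizers from estimator.py.

    # Hypothetical sketch: fit one regressor per creation month, with a
    # linear trend over per-month target means as a fallback for months
    # that appear only in the test set.
    import numpy as np

    def month_key(timestamp):
        # bucket a timestamp into a (year, month) pair
        return (timestamp.year, timestamp.month)

    def fit_per_month(frame, target, featurize, make_model):
        # one model per creation month; featurize/make_model are supplied
        # by the caller (e.g. the FeatureUnion and SGDRegressor above)
        models = {}
        for key, group in frame.groupby(frame['created_time'].map(month_key)):
            model = make_model()
            model.fit(featurize(group), group[target])
            models[key] = model
        return models

    def monthly_trend(frame, target):
        # linear trend over per-month target means, for extrapolating
        # into months unseen during training
        means = frame.groupby(frame['created_time'].map(month_key))[target].mean()
        slope, intercept = np.polyfit(np.arange(len(means)), means.values, 1)
        return slope, intercept

The intent would be to predict with the matching per-month model where one exists and fall back to the extrapolated trend for later months.
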
207 changes: 207 additions & 0 deletions estimator.py
@@ -0,0 +1,207 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import KFold

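# The three regression targets; CV toggles k-fold evaluation before the
# final submission is built.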
PREDICTABLES = ['num_votes', 'num_views', 'num_comments']
CV = True

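# The competition CSVs live under data/, which .gitignore keeps out of git.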
train = pd.io.parsers.read_csv('data/train.csv')
test = pd.io.parsers.read_csv('data/test.csv')

train['created_time'] = pd.to_datetime(train['created_time'])
test['created_time'] = pd.to_datetime(test['created_time'])

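# The custom transformers below implement the fit/transform interface that
# sklearn's Pipeline and FeatureUnion expect.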
class FactorExtractor:
    # Extracts one categorical column as a list of dicts for DictVectorizer.
    def __init__(self, factor):
        self.factor = factor

    def transform(self, data):
        return [{self.factor: self.normalize(tt)} for tt in data[self.factor]]

    def fit(self, *_):
        return self

    def normalize(self, tag):
        # non-string values (NaN for missing cells) become a sentinel level
        if not isinstance(tag, str): tag = '_MISSING_'
        return tag

class Dictorizer:
    # Wraps each scalar in a {label: value} dict for DictVectorizer. Note:
    # numeric values (like k-means cluster ids) pass through as a single
    # numeric column rather than being one-hot encoded.
    def __init__(self, label):
        self.label = label

    def transform(self, data):
        return [{self.label: value} for value in data]

    def fit(self, *_):
        return self

class LocationExtractor:
    # Pulls (latitude, longitude) pairs out as a two-column array.
    def transform(self, data):
        return np.asarray(data[['latitude', 'longitude']])

    def fit(self, *_):
        return self

class DescriptionExtractor:
    # Returns the description column as strings (NaN becomes the string
    # 'nan'; the normalize helper below is defined but not yet used).
    def transform(self, data):
        return np.asarray(data['description']).astype(str)

    def fit(self, *_):
        return self

    def normalize(self, desc):
        if not isinstance(desc, str): desc = ''
        return desc

class LengthVectorizer:
    # Maps each string to its length as a float feature.
    VEC_LEN = np.vectorize(len)

    def transform(self, data):
        return self.VEC_LEN(data).astype(float)

    def fit(self, *_):
        return self

class ArrayUpDimension:
    # Reshapes a 1-D array into an (n_samples, 1) column for later steps.
    def transform(self, data):
        return data.reshape((data.shape[0], 1))

    def fit(self, *_):
        return self

class DateExtractor:
    def transform(self, data):
        return data['created_time']

    def fit(self, *_):
        return self

class MonthsSinceCreated:
    # Whole-month difference between each date and a fixed end date.
    def __init__(self, end_date):
        self.end_date = end_date

    def transform(self, data):
        return np.array(map(self.difference, data))

    def difference(self, date):
        months = self.end_date.month - date.month
        years = self.end_date.year - date.year
        return 12 * years + months

    def fit(self, *_):
        return self

class MiniBatchKMeansTransformer:
    # Clusters lat/lon points and emits a cluster id for each sample.
    def __init__(self, n_clusters):
        self.model = MiniBatchKMeans(n_clusters = n_clusters)

    def fit(self, data, _):
        self.model.fit(data)
        return self

    def transform(self, data):
        return self.model.predict(data)

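# Feature pipelines: each one extracts a column (or column pair) from the
# frame and turns it into a numeric feature block.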
tag_featurizer = Pipeline([
    ('tag_type_extractor', FactorExtractor('tag_type')),
    ('dict_vectorizer', DictVectorizer(sparse = False))
])

source_featurizer = Pipeline([
    ('source_extractor', FactorExtractor('source')),
    ('dict_vectorizer', DictVectorizer(sparse = False))
])

location_featurizer = Pipeline([
    ('location_extractor', LocationExtractor()),
    ('kmeans', MiniBatchKMeansTransformer(n_clusters = 8)),
    ('dicts', Dictorizer('location')),
    ('dict_vectorizer', DictVectorizer(sparse = False))
])

desc_length_featurizer = Pipeline([
    ('desc_extractor', DescriptionExtractor()),
    ('len_vectorizer', LengthVectorizer()),
    ('scaler', StandardScaler()),
    ('updim_array', ArrayUpDimension())
])

desc_ngrams_featurizer = Pipeline([
    ('desc_extractor', DescriptionExtractor()),
    ('count_vectorizer', CountVectorizer(ngram_range = (1, 3), encoding = 'cp1252')),
    ('tfidf_transformer', TfidfTransformer())
])

months_since_created = Pipeline([
    ('date_extractor', DateExtractor()),
    ('months_since', MonthsSinceCreated(pd.to_datetime('2013-09-20'))),
])

features = FeatureUnion([
    ('tag_features', tag_featurizer),
    ('source_features', source_featurizer),
    ('location_featurizer', location_featurizer),
    ('desc_length_featurizer', desc_length_featurizer),
    ('desc_tfidf_ngrams', desc_ngrams_featurizer),
])

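# A single SGD-trained linear model, refit separately for each target below.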
predictor = SGDRegressor(verbose=1)

pipeline = Pipeline([
('feature_union', features),
('predictor', predictor)
])

# import pdb; pdb.set_trace()  # debugging breakpoint, disabled so the script runs end-to-end

if CV:
    # 10-fold split over row indices (pre-0.18 sklearn.cross_validation API)
    k_fold = KFold(train.shape[0], 10, indices = True)

    scores = dict(zip(PREDICTABLES, [[], [], []]))
    fold_n = 0

    for construct_idx, validate_idx in k_fold:
        fold_n += 1
        print 'Fold ' + str(fold_n)
        construct = train.iloc[construct_idx]
        validate = train.iloc[validate_idx]

        for predictable in PREDICTABLES:
            # months features are computed but not yet wired into the pipeline
            construct_months = months_since_created.transform(construct)
            construct_targets = construct[predictable]

            validate_months = months_since_created.transform(validate)
            validate_targets = validate[predictable]

            pipeline.fit(construct, construct_targets)
            score = pipeline.score(validate, validate_targets)  # R^2
            scores[predictable].append(score)

    for predictable in PREDICTABLES:
        print predictable + ': ' + str(sum(scores[predictable]) / len(scores[predictable]))

submission = pd.DataFrame({'id': test['id']})

print "Building submission"
for predictable in PREDICTABLES:
    # refit on the full training set, one target at a time
    train_months = months_since_created.transform(train)  # unused for now
    train_target = train[predictable]

    test_months = months_since_created.transform(test)  # unused for now

    pipeline.fit(train, train_target)
    predictions = pipeline.predict(test)

    # clamp negatives and round to integer counts
    predictions[predictions < 0] = 0.0
    predictions = np.around(predictions).astype(int)
    submission[predictable] = predictions

submission.to_csv('submission.txt', index = False)

# import pdb; pdb.set_trace()  # debugging breakpoint, disabled
