Commit 54cdcca: Working model

zacstewart committed Oct 17, 2013 (initial commit, 0 parents)
Showing 4 changed files with 149,815 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
data/*
venv
30 changes: 30 additions & 0 deletions CV performance.txt
@@ -0,0 +1,30 @@
tags, locations
num_votes: 0.20826522914
num_views: 0.0463900779609
num_comments: -0.00915516610715

tags, locations, desc length
num_votes: 0.213893936991
num_views: 0.0509179454511
num_comments: -0.00565685033158

tags, locations, desc length, 1-gram counts
num_votes: 0.194265113554
num_views: 0.0678427693326
num_comments: 0.0246830406565

tags, locations, desc length, 1-gram tfidf
num_votes: 0.194265113554
num_views: 0.0678427693326
num_comments: 0.0246830406565

tags, locations, desc length, 1,2,3-gram tfidf, month

tags, source, locations, desc length, 1,2,3-gram tfidf
num_votes: 0.264082779812
num_views: 0.0957141953819
num_comments: 0.033120785281

Ideas

* train/predict values per month and then extrapolate (see the sketch below)
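
A minimal sketch of that per-month idea, under stated assumptions: none of this code is part of the commit, the helper names (month_key, fit_per_month, monthly_trend) are hypothetical, and a real version would plug in the featurizers from estimator.py.

    # Hypothetical sketch: fit one regressor per creation month, with a
    # linear trend over per-month target means as a fallback for months
    # that appear only in the test set.
    import numpy as np

    def month_key(timestamp):
        # bucket a timestamp into a (year, month) pair
        return (timestamp.year, timestamp.month)

    def fit_per_month(frame, target, featurize, make_model):
        # one model per creation month; featurize/make_model are supplied
        # by the caller (e.g. the FeatureUnion and SGDRegressor above)
        models = {}
        for key, group in frame.groupby(frame['created_time'].map(month_key)):
            model = make_model()
            model.fit(featurize(group), group[target])
            models[key] = model
        return models

    def monthly_trend(frame, target):
        # linear trend over per-month target means, for extrapolating
        # into months unseen during training
        means = frame.groupby(frame['created_time'].map(month_key))[target].mean()
        slope, intercept = np.polyfit(np.arange(len(means)), means.values, 1)
        return slope, intercept

The intent would be to predict with the matching per-month model where one exists and fall back to the extrapolated trend for later months.
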
207 changes: 207 additions & 0 deletions estimator.py
@@ -0,0 +1,207 @@
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import KFold

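# The three regression targets; CV toggles k-fold evaluation before the
# final submission is built.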
PREDICTABLES = ['num_votes', 'num_views', 'num_comments']
CV = True

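# The competition CSVs live under data/, which .gitignore keeps out of git.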
train = pd.io.parsers.read_csv('data/train.csv')
test = pd.io.parsers.read_csv('data/test.csv')

train['created_time'] = pd.to_datetime(train['created_time'])
test['created_time'] = pd.to_datetime(test['created_time'])

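# The custom transformers below implement the fit/transform interface that
# sklearn's Pipeline and FeatureUnion expect.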
class FactorExtractor:
    # Extracts one categorical column as a list of dicts for DictVectorizer.
    def __init__(self, factor):
        self.factor = factor

    def transform(self, data):
        return [{self.factor: self.normalize(tt)} for tt in data[self.factor]]

    def fit(self, *_):
        return self

    def normalize(self, tag):
        # non-string values (NaN for missing cells) become a sentinel level
        if not isinstance(tag, str): tag = '_MISSING_'
        return tag

class Dictorizer:
    # Wraps each scalar in a {label: value} dict for DictVectorizer. Note:
    # numeric values (like k-means cluster ids) pass through as a single
    # numeric column rather than being one-hot encoded.
    def __init__(self, label):
        self.label = label

    def transform(self, data):
        return [{self.label: value} for value in data]

    def fit(self, *_):
        return self

class LocationExtractor:
    # Pulls (latitude, longitude) pairs out as a two-column array.
    def transform(self, data):
        return np.asarray(data[['latitude', 'longitude']])

    def fit(self, *_):
        return self

class DescriptionExtractor:
    # Returns the description column as strings (NaN becomes the string
    # 'nan'; the normalize helper below is defined but not yet used).
    def transform(self, data):
        return np.asarray(data['description']).astype(str)

    def fit(self, *_):
        return self

    def normalize(self, desc):
        if not isinstance(desc, str): desc = ''
        return desc

class LengthVectorizer:
    # Maps each string to its length as a float feature.
    VEC_LEN = np.vectorize(len)

    def transform(self, data):
        return self.VEC_LEN(data).astype(float)

    def fit(self, *_):
        return self

class ArrayUpDimension:
    # Reshapes a 1-D array into an (n_samples, 1) column for later steps.
    def transform(self, data):
        return data.reshape((data.shape[0], 1))

    def fit(self, *_):
        return self

class DateExtractor:
    def transform(self, data):
        return data['created_time']

    def fit(self, *_):
        return self

class MonthsSinceCreated:
    # Whole-month difference between each date and a fixed end date.
    def __init__(self, end_date):
        self.end_date = end_date

    def transform(self, data):
        return np.array(map(self.difference, data))

    def difference(self, date):
        months = self.end_date.month - date.month
        years = self.end_date.year - date.year
        return 12 * years + months

    def fit(self, *_):
        return self

class MiniBatchKMeansTransformer:
    # Clusters lat/lon points and emits a cluster id for each sample.
    def __init__(self, n_clusters):
        self.model = MiniBatchKMeans(n_clusters = n_clusters)

    def fit(self, data, _):
        self.model.fit(data)
        return self

    def transform(self, data):
        return self.model.predict(data)

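# Feature pipelines: each one extracts a column (or column pair) from the
# frame and turns it into a numeric feature block.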
tag_featurizer = Pipeline([
    ('tag_type_extractor', FactorExtractor('tag_type')),
    ('dict_vectorizer', DictVectorizer(sparse = False))
])

source_featurizer = Pipeline([
    ('source_extractor', FactorExtractor('source')),
    ('dict_vectorizer', DictVectorizer(sparse = False))
])

location_featurizer = Pipeline([
    ('location_extractor', LocationExtractor()),
    ('kmeans', MiniBatchKMeansTransformer(n_clusters = 8)),
    ('dicts', Dictorizer('location')),
    ('dict_vectorizer', DictVectorizer(sparse = False))
])

desc_length_featurizer = Pipeline([
    ('desc_extractor', DescriptionExtractor()),
    ('len_vectorizer', LengthVectorizer()),
    ('scaler', StandardScaler()),
    ('updim_array', ArrayUpDimension())
])

desc_ngrams_featurizer = Pipeline([
    ('desc_extractor', DescriptionExtractor()),
    ('count_vectorizer', CountVectorizer(ngram_range = (1, 3), encoding = 'cp1252')),
    ('tfidf_transformer', TfidfTransformer())
])

months_since_created = Pipeline([
    ('date_extractor', DateExtractor()),
    ('months_since', MonthsSinceCreated(pd.to_datetime('2013-09-20'))),
])

features = FeatureUnion([
    ('tag_features', tag_featurizer),
    ('source_features', source_featurizer),
    ('location_featurizer', location_featurizer),
    ('desc_length_featurizer', desc_length_featurizer),
    ('desc_tfidf_ngrams', desc_ngrams_featurizer),
])

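# A single SGD-trained linear model, refit separately for each target below.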
predictor = SGDRegressor(verbose=1)

pipeline = Pipeline([
('feature_union', features),
('predictor', predictor)
])

# import pdb; pdb.set_trace()  # debugging breakpoint, disabled so the script runs end-to-end

if CV:
    # 10-fold split over row indices (pre-0.18 sklearn.cross_validation API)
    k_fold = KFold(train.shape[0], 10, indices = True)

    scores = dict(zip(PREDICTABLES, [[], [], []]))
    fold_n = 0

    for construct_idx, validate_idx in k_fold:
        fold_n += 1
        print 'Fold ' + str(fold_n)
        construct = train.iloc[construct_idx]
        validate = train.iloc[validate_idx]

        for predictable in PREDICTABLES:
            # months features are computed but not yet wired into the pipeline
            construct_months = months_since_created.transform(construct)
            construct_targets = construct[predictable]

            validate_months = months_since_created.transform(validate)
            validate_targets = validate[predictable]

            pipeline.fit(construct, construct_targets)
            score = pipeline.score(validate, validate_targets)  # R^2
            scores[predictable].append(score)

    for predictable in PREDICTABLES:
        print predictable + ': ' + str(sum(scores[predictable]) / len(scores[predictable]))

submission = pd.DataFrame({'id': test['id']})

print "Building submission"
for predictable in PREDICTABLES:
    # refit on the full training set, one target at a time
    train_months = months_since_created.transform(train)  # unused for now
    train_target = train[predictable]

    test_months = months_since_created.transform(test)  # unused for now

    pipeline.fit(train, train_target)
    predictions = pipeline.predict(test)

    # clamp negatives and round to integer counts
    predictions[predictions < 0] = 0.0
    predictions = np.around(predictions).astype(int)
    submission[predictable] = predictions

submission.to_csv('submission.txt', index = False)

# import pdb; pdb.set_trace()  # debugging breakpoint, disabled
