Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 54cdcca
Showing
4 changed files
with
149,815 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
data/* | ||
venv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
tags, locations | ||
num_votes: 0.20826522914 | ||
num_views: 0.0463900779609 | ||
num_comments: -0.00915516610715 | ||
|
||
tags, locations, desc length | ||
num_votes: 0.213893936991 | ||
num_views: 0.0509179454511 | ||
num_comments: -0.00565685033158 | ||
|
||
tags, locations, desc length, 1-gram counts | ||
num_votes: 0.194265113554 | ||
num_views: 0.0678427693326 | ||
num_comments: 0.0246830406565 | ||
|
||
tags, locations, desc length, 1-gram tfidf | ||
num_votes: 0.194265113554 | ||
num_views: 0.0678427693326 | ||
num_comments: 0.0246830406565 | ||
|
||
tags, locations, desc length, 1,2,3-gram tfidf, month | ||
|
||
tags, source, locations, desc length, 1,2,3-gram tfidf | ||
num_votes: 0.264082779812 | ||
num_views: 0.0957141953819 | ||
num_comments: 0.033120785281 | ||
|
||
Ideas | ||
|
||
* train/predict values per-month and then extrapolate |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
import pandas as pd | ||
import numpy as np | ||
from sklearn.feature_extraction import DictVectorizer | ||
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer | ||
from sklearn.preprocessing import StandardScaler | ||
from sklearn.cluster import MiniBatchKMeans | ||
from sklearn.linear_model import SGDRegressor | ||
from sklearn.pipeline import Pipeline, FeatureUnion | ||
from sklearn.cross_validation import KFold | ||
|
||
# Target columns; one model is fitted per entry.
PREDICTABLES = ['num_votes', 'num_views', 'num_comments']
# When true, a cross-validation pass runs before the submission is built.
CV = True

# Load each data set and parse its creation timestamp column.
train = pd.read_csv('data/train.csv')
train['created_time'] = pd.to_datetime(train['created_time'])

test = pd.read_csv('data/test.csv')
test['created_time'] = pd.to_datetime(test['created_time'])
|
||
class FactorExtractor:
    """Pipeline step that turns one categorical column into a list of dicts.

    Each entry of ``data[factor]`` becomes ``{factor: value}``; anything that
    is not a string (e.g. a missing value) is replaced by ``'_MISSING_'``.
    """

    def __init__(self, factor):
        # Key used both to index ``data`` and as the key of every output dict.
        self.factor = factor

    def fit(self, *_):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, data):
        """Return one ``{factor: normalized value}`` dict per row."""
        return [{self.factor: self.normalize(value)} for value in data[self.factor]]

    def normalize(self, tag):
        """Return ``tag`` if it is a string, otherwise ``'_MISSING_'``."""
        # isinstance instead of an exact type comparison so that str
        # subclasses (numpy string scalars, for instance) count as strings.
        return tag if isinstance(tag, str) else '_MISSING_'
|
||
class Dictorizer:
    """Pipeline step that wraps every incoming value in a single-key dict."""

    def __init__(self, label):
        # Key shared by all output dicts.
        self.label = label

    def fit(self, *_):
        """No fitting required; returns the instance itself."""
        return self

    def transform(self, data):
        """Return ``[{label: item}, ...]`` with one dict per item of ``data``."""
        key = self.label
        wrapped = []
        for item in data:
            wrapped.append({key: item})
        return wrapped
|
||
class LocationExtractor:
    """Pipeline step that selects the latitude/longitude columns as an array."""

    def fit(self, *_):
        """Stateless step; returns ``self``."""
        return self

    def transform(self, data):
        """Return ``data[['latitude', 'longitude']]`` converted with ``np.asarray``."""
        coordinates = data[['latitude', 'longitude']]
        return np.asarray(coordinates)
|
||
class DescriptionExtractor:
    """Pipeline step that returns the ``description`` column as a string array."""

    def fit(self, *_):
        """Stateless step; returns ``self``."""
        return self

    def transform(self, data):
        """Return the normalized descriptions as a numpy array of strings.

        Every entry goes through ``normalize`` first, so a missing description
        becomes the empty string instead of being stringified to text such as
        ``'nan'``.
        """
        cleaned = [self.normalize(desc) for desc in data['description']]
        return np.asarray(cleaned, dtype=str)

    def normalize(self, desc):
        """Return ``desc`` if it is a string, otherwise the empty string."""
        return desc if isinstance(desc, str) else ''
|
||
class LengthVectorizer:
    """Pipeline step that maps every element of an array to its length."""

    # Element-wise ``len``. Declaring ``otypes`` makes the result float
    # directly and lets the call succeed on empty input, where numpy cannot
    # infer the output type by probing the first element.
    VEC_LEN = np.vectorize(len, otypes=[float])

    def transform(self, data):
        """Return a float array holding ``len(x)`` for each element of ``data``."""
        return self.VEC_LEN(data)

    def fit(self, *_):
        """Stateless step; returns ``self``."""
        return self
|
||
class ArrayUpDimension:
    """Pipeline step that reshapes an array into a single column."""

    def fit(self, *_):
        """Stateless step; returns ``self``."""
        return self

    def transform(self, data):
        """Return ``data`` reshaped to ``(data.shape[0], 1)``."""
        n_rows = data.shape[0]
        return data.reshape(n_rows, 1)
|
||
class DateExtractor:
    """Pipeline step that selects the ``created_time`` entry of its input."""

    def fit(self, *_):
        """Stateless step; returns ``self``."""
        return self

    def transform(self, data):
        """Return ``data['created_time']`` unchanged."""
        created = data['created_time']
        return created
|
||
class MonthsSinceCreated:
    """Pipeline step that converts dates to whole calendar months before ``end_date``."""

    def __init__(self, end_date):
        # Reference date; only its ``year`` and ``month`` attributes are read.
        self.end_date = end_date

    def fit(self, *_):
        """Stateless step; returns ``self``."""
        return self

    def transform(self, data):
        """Return an array with ``difference(date)`` for every date in ``data``."""
        # Build a real list first: handing a lazy ``map`` object to np.array
        # produces a 0-d object array instead of one element per date.
        return np.array([self.difference(date) for date in data])

    def difference(self, date):
        """Return the calendar-month distance from ``date`` to ``end_date``.

        Days are ignored: only the year and month fields take part, so the
        result is negative when ``date`` lies in a later month than ``end_date``.
        """
        years = self.end_date.year - date.year
        months = self.end_date.month - date.month
        return 12 * years + months
|
||
class MiniBatchKMeansTransformer:
    """Pipeline step that replaces each row by the label its clusterer predicts."""

    def __init__(self, n_clusters):
        # Underlying clusterer; fitted in ``fit`` and queried in ``transform``.
        self.model = MiniBatchKMeans(n_clusters=n_clusters)

    def fit(self, data, *_):
        """Fit the clusterer on ``data`` and return ``self``.

        Any further positional arguments (such as the targets a pipeline
        passes along) are accepted and ignored, matching the other steps
        in this file.
        """
        self.model.fit(data)
        return self

    def transform(self, data):
        """Return ``self.model.predict(data)``."""
        return self.model.predict(data)
|
||
# One-hot encoding of the 'tag_type' column (non-strings become '_MISSING_').
tag_featurizer = Pipeline([
    ('tag_type_extractor', FactorExtractor('tag_type')),
    ('dict_vectorizer', DictVectorizer(sparse=False)),
])

# One-hot encoding of the 'source' column.
source_featurizer = Pipeline([
    ('source_extractor', FactorExtractor('source')),
    ('dict_vectorizer', DictVectorizer(sparse=False)),
])

# Latitude/longitude -> k-means cluster label -> vectorized 'location' feature.
location_featurizer = Pipeline([
    ('location_extractor', LocationExtractor()),
    ('kmeans', MiniBatchKMeansTransformer(n_clusters=8)),
    ('dicts', Dictorizer('location')),
    ('dict_vectorizer', DictVectorizer(sparse=False)),
])

# Standardized description length. The lengths are reshaped into a single
# column *before* scaling so that StandardScaler receives the 2-D
# (n_samples, n_features) input its API is documented to take.
desc_length_featurizer = Pipeline([
    ('desc_extractor', DescriptionExtractor()),
    ('len_vectorizer', LengthVectorizer()),
    ('updim_array', ArrayUpDimension()),
    ('scaler', StandardScaler()),
])

# TF-IDF weighted 1- to 3-gram counts of the description text.
desc_ngrams_featurizer = Pipeline([
    ('desc_extractor', DescriptionExtractor()),
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 3), encoding='cp1252')),
    ('tfidf_transformer', TfidfTransformer()),
])

# Calendar months between 'created_time' and 2013-09-20. Not part of
# ``features`` below.
months_since_created = Pipeline([
    ('date_extractor', DateExtractor()),
    ('months_since', MonthsSinceCreated(pd.to_datetime('2013-09-20'))),
])

# All feature blocks concatenated side by side.
features = FeatureUnion([
    ('tag_features', tag_featurizer),
    ('source_featurs', source_featurizer),
    ('location_featurizer', location_featurizer),
    ('desc_length_featurizer', desc_length_featurizer),
    ('desc_tfidf_ngrams', desc_ngrams_featurizer),
])

predictor = SGDRegressor(verbose=1)

# Full model: feature extraction followed by the regressor.
pipeline = Pipeline([
    ('feature_union', features),
    ('predictor', predictor),
])
|
||
if CV:
    # 10-fold cross-validation over the training rows, one model per target.
    # NOTE(review): this is the legacy sklearn.cross_validation.KFold call
    # signature (n, n_folds, indices) - confirm against the installed version.
    k_fold = KFold(train.shape[0], 10, indices=True)

    # One list of per-fold scores for every target column.
    scores = {predictable: [] for predictable in PREDICTABLES}

    for fold_n, (construct_idx, validate_idx) in enumerate(k_fold, 1):
        print('Fold ' + str(fold_n))
        construct = train.iloc[construct_idx]
        validate = train.iloc[validate_idx]

        for predictable in PREDICTABLES:
            pipeline.fit(construct, construct[predictable])
            score = pipeline.score(validate, validate[predictable])
            scores[predictable].append(score)

    # Report the mean score over all folds for each target.
    for predictable in PREDICTABLES:
        print(predictable + ': ' + str(sum(scores[predictable]) / len(scores[predictable])))

submission = pd.DataFrame({'id': test['id']})

print("Building submission")
for predictable in PREDICTABLES:
    # Refit on the complete training set before predicting the test set.
    pipeline.fit(train, train[predictable])
    predictions = pipeline.predict(test)

    # Negative predictions are clamped to zero, then rounded to integers.
    predictions[predictions < 0] = 0.0
    predictions = np.around(predictions).astype(int)
    submission[predictable] = predictions

submission.to_csv('submission.txt', index=False)
Oops, something went wrong.