# Kaggle Stumbleupon dataset

This dataset is a collection of StumbleUpon links; the goal is to predict whether each linked page is evergreen (of lasting interest) or ephemeral.

In [17]:
# import statements
# the standard stuff
import numpy as np
import pandas as pd
import nltk
import json
import sklearn.feature_extraction.text as sktext
import string
import sklearn.naive_bayes as nb
import sklearn.svm as svm

import sklearn.cross_validation as cross_validation
import sklearn.metrics as metrics

In [18]:
def cv_loop(X, labels, model, K):
    """
    Score a model with K rounds of random hold-out validation.

    Every round draws a fresh 80/20 train/validation split (seeded with
    ``round * 15`` so repeated runs produce the same splits), refits
    ``model`` on the training part, scores the held-out part via
    ``predict_proba`` and prints that round's ROC AUC.

    X      -- feature matrix accepted by ``train_test_split`` and ``model``
    labels -- target values aligned with the rows of ``X``
    model  -- estimator exposing ``fit`` and ``predict_proba``; it is refit
              in place on every round
    K      -- number of rounds

    Returns the mean of the K per-round AUC values.
    """

    SEED = 15
    auc_total = 0.
    for fold in range(K):
        split = cross_validation.train_test_split(
            X, labels,
            test_size=0.2,
            random_state=fold * SEED)
        X_train, X_cv, labels_train, labels_cv = split

        model.fit(X_train, labels_train)

        # second column of predict_proba is used as the score
        # (presumably the probability of the positive class for 0/1 labels)
        scores = model.predict_proba(X_cv)[:, 1]
        fold_auc = metrics.roc_auc_score(labels_cv, scores)
        print("AUC (fold %d/%d): %f" % (fold + 1, K, fold_auc))

        auc_total += fold_auc
    return auc_total / K

In [22]:
# Load the train and test CSV files; cells containing "?" are read as NaN.
training_data, testing_data = (
    pd.read_csv(csv_path, na_values="?")
    for csv_path in ("./data/train.csv", "./data/test.csv")
)
# quick sanity check of the test set: dimensions, then the first rows
print(testing_data.shape)
testing_data.head()

(2958, 27)


Unnamed: 0.1,Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,...,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio
0,4646,http://sportsillustrated.cnn.com/2011_swimsuit...,7668,"{""title"":""Kate Upton Swimsuit by Kikidoll 2011...",,,1.158209,0.505917,0.428994,0.023669,...,1.014599,,0,78,,547,338,4,0.005917,0.053571
1,3110,http://lungcancercauses.org/infographics/how-t...,4836,"{""title"":""How to Prevent Cancer for FREE Infog...",health,0.757862,1.6,0.3,0.3,0.3,...,-1.0,,0,30,0.0,188,10,7,0.1,0.0625
2,467,http://www.melskitchencafe.com/2011/04/penne-w...,5920,"{""url"":""melskitchencafe 2011 04 penne with roa...",,,1.794118,0.17341,0.086705,0.052023,...,-1.0,,1,6,,12792,173,7,0.052023,0.111111
3,526,http://www.npr.org/blogs/thetwo-way/2009/09/ba...,1467,"{""title"":""Aw Cutie Pie Sends Baseball Back Whe...",sports,0.381499,1.981308,0.581967,0.245902,0.061475,...,0.078014,1.0,0,55,0.0,1634,244,9,0.278689,0.095541
4,2194,http://www.buzzfeed.com/mjs538/the-10-worst-mo...,5964,"{""title"":""The 10 Worst Moose Knuckles At The O...",sports,0.621481,1.38191,0.406863,0.05719,0.02451,...,2.478261,1.0,1,20,0.0,10422,612,5,0.26634,0.067568


In [20]:
# The goal is to create a bag-of-words model out of the data:
# reformat the data so that only the page body text (plus id and label) is kept.

# Columns are addressed by position: 2 and 3 are assumed to be `urlid` and the
# JSON `boilerplate` blob (matching the test-set preview shown earlier), and the
# last column is taken as the label -- TODO confirm against train.csv.
# `.values` is used instead of DataFrame.as_matrix(), which was removed in
# pandas 1.0; both return the same 2-D array of rows.
training_data_bow = [[item[2], json.loads(item[3])["body"], item[-1]]
                     for item in training_data.values]

tr_urlids = [x[0] for x in training_data_bow]
# a JSON null body decodes to None; treat it as empty text
tr_text = [x[1] if x[1] is not None else "" for x in training_data_bow]
tr_label = [x[2] for x in training_data_bow]

# strip punctuation (stop words are NOT removed: CountVectorizer below is
# used with its default settings)
punctuation_dict = dict((ord(char), None) for char in string.punctuation)
tr_text_no_punc = [x.translate(punctuation_dict) for x in tr_text]


# raw token counts -> TF-IDF weights; this was pretty much taken straight off
# the text tutorial on sklearn.
# NOTE: the vocabulary and IDF weights are fit on ALL training rows before
# cv_loop splits them, so held-out rows contribute to these statistics.
count_vec = sktext.CountVectorizer()
tr_text_wc = count_vec.fit_transform(tr_text_no_punc)

tfidf_tf = sktext.TfidfTransformer()
tr_text_tfidf = tfidf_tf.fit_transform(tr_text_wc)

# create and test the model
model = nb.MultinomialNB()

cv_loop(tr_text_tfidf, tr_label, model, 10)

AUC (fold 1/10): 0.852821
AUC (fold 2/10): 0.850404
AUC (fold 3/10): 0.853193
AUC (fold 4/10): 0.870312
AUC (fold 5/10): 0.863857
AUC (fold 6/10): 0.860326
AUC (fold 7/10): 0.876845
AUC (fold 8/10): 0.879162
AUC (fold 9/10): 0.877240
AUC (fold 10/10): 0.844856


0.86290162535772086

In [28]:
# Well that looks good, lets fit the model completely and then make our predictions
model.fit(tr_text_tfidf, tr_label)

# Same positional extraction as for the training data: column 2 is urlid and
# column 3 is the JSON boilerplate blob (see the test-set preview).
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
testing_data_bow = [[item[2], json.loads(item[3])["body"]]
                    for item in testing_data.values]
print(len(testing_data_bow))

test_urlids = [x[0] for x in testing_data_bow]
# a JSON null body decodes to None; treat it as empty text
test_text = [x[1] if x[1] is not None else "" for x in testing_data_bow]

print(len(test_urlids))
print(len(test_text))

test_text_no_punc = [x.translate(punctuation_dict) for x in test_text]

# Reuse BOTH fitted training transformers. The test features must be built
# with the vocabulary and the IDF weights the model was trained on; fitting a
# fresh TfidfTransformer on the test counts would weight the same words
# differently from the training features.
test_text_wc = count_vec.transform(test_text_no_punc)
test_text_tfidf = tfidf_tf.transform(test_text_wc)

# second column of predict_proba is used as the submitted score
preds = model.predict_proba(test_text_tfidf)[:, 1]

# Combine the predictions and url IDs into a pandas dataframe.
# Setting urlid as the index fixes the layout of the written file
# (urlid first, then label) regardless of the dictionary's key order.
pred_df = pd.DataFrame({'label': preds, 'urlid': test_urlids}).set_index('urlid')
print(pred_df.shape)
pred_df.to_csv('evergreen_submission_1')

2958
2958
2958
(2958, 1)
