### Use this markdown to generate the predictors


In [93]:
#imported libraries
import pandas as pd
import numpy as np
import scipy as sp
import math
import matplotlib.pyplot as plt
import langdetect
import datetime
%matplotlib inline  
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

#Creating of the input data
downloads = pd.read_csv('train_app_downloads.csv')
reviews = pd.read_csv('train_app_review.csv')
ratings = pd.read_csv('train_app_rating.csv')
usages = pd.read_csv('train_usage.csv')
revenues = pd.read_csv('train_revenue.csv')
output = pd.read_csv('train_final_downloads.csv')
dateRange = pd.date_range('2015-03-01', periods=56).format(formatter=lambda x: x.strftime('%Y-%m-%d'))
sentiment.values[sentiment.values==0.0] = 0.5
reviews["sentiment_score"] = sentiment.values 

#We map -1 to 0 in the downloads (there are no 0 in the initial data)
replacementValue=0
downloads = downloads.replace(-1,replacementValue)

#Minor corrections
ratings = ratings.rename(columns={'start1': 'star1'})
ratings = pd.merge(downloads.drop(dateRange,1), ratings.drop('Unnamed: 0', 1), how='left',
                   on=["id","name","category"]).replace("NaN",replacementValue)

#### Initialize the predictors matrix

In [94]:
predictors = pd.concat([downloads["id"],downloads["name"],downloads["category"],downloads["device"]],
                       axis=1,keys=["id","name","category","device"])
predictors.head()

Unnamed: 0,id,name,category,device
0,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",Social Networking,iphone
1,281922769,Mobile MIM,Medical,ipad
2,281922769,Mobile MIM,Medical,iphone
3,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,ipad
4,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,iphone


#### Use the train_app_downloads.csv file

In [95]:
#Generate the logWeekly average
def generator_weekAvg(inp,w):
    if (np.count_nonzero(inp[5+w*7:12+w*7] - replacementValue*np.ones(len(inp[5+w*7:12+w*7]))) == 0):
        return 0
    return  math.log(1.0*sum(inp[5+w*7:12+w*7])/np.count_nonzero(inp[5+w*7:12+w*7] 
                                                                 - replacementValue*np.ones(len(inp[5+w*7:12+w*7]))))

for w in range(8):
    predictors["week_"+str(w+1)] = downloads.apply(generator_weekAvg,axis=1,args=(w,))

In [96]:
#Generate the daily average
def generator_dailyAvg(inp):
    if (np.count_nonzero(inp[5:] - replacementValue*np.ones(len(inp[5:]))) == 0):
        return 0
    return  (1.0*sum(inp[5:])/np.count_nonzero(inp[5:] - replacementValue*np.ones(len(inp[5:]))))
    #return  math.log(1.0*sum(inp[5:])/len(inp[5:]))

predictors["daily_avg"] = downloads.apply(generator_dailyAvg,axis=1)
#This one is bad

In [97]:
#Generate the polynomial coefficients
def generator_coef(inp, coef):
    return  np.polyfit(range(56),inp[5:],coef)[0]
#Redo by ignoring the -1 / 0 ? w = [1110011]     w = np.not_equal(inp[5:],np.ones(len(inp[5:]))*replacementValue

for c in range(4):
    predictors["coef_"+str(c)] = downloads.apply(generator_coef,axis=1,args=(c,))

In [98]:
#Generate the step max and min (we ignore the values of 0)
def generator_maxStep(inp,maximum):
    if (np.count_nonzero(inp[5:] - replacementValue*np.ones(len(inp[5:]))) == 0):
        return 0
    m = 0
    for d in range(1,56):
        if (inp[5+d]!=replacementValue and inp[4+d]!=replacementValue):
            c = (inp[5+d]-inp[4+d])
            if (maximum and m < c):
                m = c
            if ( not maximum and m > c):
                m = c
    return m
                
predictors["maxStep"] = downloads.apply(generator_maxStep,axis=1,args=(True,))
predictors["minStep"] = downloads.apply(generator_maxStep,axis=1,args=(False,))

In [99]:
#Standard Deviation
def generator_std(inp):
    if (np.count_nonzero(inp[5:] - replacementValue*np.ones(len(inp[5:]))) == 0):
        return 0
    return np.std(inp[5:])

predictors["std"] = downloads.apply(generator_std,axis = 1)

In [100]:
#Number of missing Values
def generator_missing(inp):
    return list(inp[5:]).count(replacementValue)
    #return np.count_zero(inp[5:] -replacementValue*np.ones(len(inp[5:])))

predictors["nb_missing"] = downloads.apply(generator_missing,axis = 1)

In [101]:
#device
def generator_iphone(inp):
    if (inp[4] == "iphone"):
        return 1
    else:
        return 0

def generator_ipad(inp):
    if(inp[4] == "ipad"):
        return 1
    else:
        return 0
    
predictors["iphone"] = downloads.apply(generator_iphone,axis = 1)
predictors["ipad"] = downloads.apply(generator_ipad,axis = 1)

In [102]:
#Categories
def generator_categories(inp,cat):
    if (inp[3] == cat):
        return 1
    else:
        return 0
    

for cat in list(set(downloads["category"])):
    predictors[cat] = downloads.apply(generator_categories,axis = 1,args=(cat,))

In [103]:
def detect_language(x):
    try:
        detected = langdetect.detect(x.decode('utf8','ignore'))
        if detected in ['ja']:
            return 'japanese'
        elif detected in ['zh-cn']:
            return 'chinese'
        elif detected in ['ko']:
            return 'korean'
        elif detected in ['en']:
            return 'english'
        return 'other'
    except:
        return None

def set_lang_categories(x, cat):
    if x == cat:
        return 1
    else:
        return 0
    
lang_series = predictors['name'].apply(detect_language)
for cat in list(set(lang_series)):
    predictors[cat] = lang_series.apply(set_lang_categories, args=(cat,))
    

#### Use the train_cumulative_downloads_2015-02.csv file

In [104]:
prev_downloads = pd.read_csv('train_cumulative_downloads_2015-02.csv').drop('Unnamed: 0', 1)
predictors = pd.merge(predictors, prev_downloads, how='left',
                  on=["id","device"])

#### Use the train_app_rating.csv file and reviews

In [105]:
avg_reviews = reviews.groupby('id').agg('mean')
avg_reviews['rating']
predictors['avg_review'] = predictors.join(avg_reviews['rating'],on='id')['rating'].replace("NaN",0)


In [106]:
avg_reviews = reviews.groupby('id').agg('var')
avg_reviews['rating']
predictors['var_review'] = predictors.join(avg_reviews['rating'],on='id')['rating'].replace("NaN",0)


In [107]:
ratings['num_ratings'] = ratings.ix[:,['star1','star2','star3','star4','star5']].sum(axis=1)
#Scaling the ratings
for i in range(1,6):
    ratings['star'+str(i)]=ratings['star'+str(i)].divide(ratings['num_ratings']+1)

In [108]:
#raw ratings
predictors = pd.merge(predictors, ratings.drop('Unnamed: 0', 1), how='left',
                   on=["id","name","category","device"]).replace("NaN",replacementValue)


In [125]:
#Reviews per daily downloads
#predictors['reviewPerDailyDownloads'] = predictors['num_ratings'].divide(predictors['daily_avg']+1)
predictors['ratings_per_daily_downloads'] = predictors['num_ratings'].divide(predictors['cumulative_downloads_2015-02']+1)

#Drop either this or num_ratings

In [110]:
num_versions = reviews.groupby('id').version.nunique()
num_versions.name = 'num_versions'
predictors = predictors.join(num_versions, how='left', on='id')
predictors['num_versions'] = predictors['num_versions'].replace('NaN',0)

In [111]:
num_review = reviews.groupby('id').rating.count()
num_review.name = 'num_review'
predictors = predictors.join(num_review, how='left', on='id')
predictors['num_review'] = predictors['num_review'].replace('NaN',0)

#### Use the train_release_date.csv file

In [112]:
def generate_days_since_release(x):
    return (datetime.datetime.strptime('2015-03-01', '%Y-%m-%d').date() - datetime.datetime.strptime(x, '%Y-%m-%d').date()).days

release_date = pd.read_csv('train_release_date.csv').drop('Unnamed: 0', 1)
release_date['days_since_release'] = release_date['release_date'].apply(generate_days_since_release)
predictors = pd.merge(predictors, release_date.drop('release_date', 1), how='left',
                  on=["id","name"])

# replacing any missing values with 0; not a good idea but placeholder for now
predictors['days_since_release'] = predictors['days_since_release'].replace('NaN', 0)

# The number of downloads divided by time
predictors['downloads_per_day_before'] = predictors['cumulative_downloads_2015-02'].divide(predictors['days_since_release']+1)

#### Use the sentiment score

In [113]:
#Later we can compute the weighted average of sentiment scores based on reviewers.
#add positive and negative columns to indicate the app's popularity

#print reviews.ix[id==predictors.id[0],:]["sentiment_score"]
#print reviews.ix[id==predictors.id[1],:]["sentiment_score"]

avg_score = [0]*predictors.shape[0]
predictors["positive"] = [0]*predictors.shape[0]
predictors["negative"] = [0]*predictors.shape[0]
for i in range(predictors.shape[0]):
    avg_score[i] = reviews.ix[reviews["id"]==predictors.id[i],:]["sentiment_score"].mean()
    if avg_score[i]>0.55: 
        predictors["positive"].values[i] = 1
    elif avg_score[i]<0.45: predictors["negative"].values[i] = 1
#predictors["avg_sentiment_score"] = avg_score

 

#### Use coeficients of metrics 

In [114]:
for i in range(4):
    predictors["m"+str(i+1)+"_max"] = np.zeros(predictors.shape[0]) 
    predictors["m"+str(i+1)+"_min"] = np.zeros(predictors.shape[0])
    predictors["m"+str(i+1)+"_mean"] = np.zeros(predictors.shape[0])
    predictors["m"+str(i+1)+"_std"] = np.zeros(predictors.shape[0])
    for j in range(3):
        predictors["m"+str(i+1)+"_coef_"+str(j)] = np.zeros(predictors.shape[0])

for i in range(predictors.shape[0]):
    if predictors["id"].values[i] not in usages["id"].values: continue
    for j in range(4):
        tmp = usages.ix[usages["id"] == predictors["id"].values[i],:]
        time_series = np.array(tmp.ix[tmp["metric"] == j+1,6:14])[0]   
        if -1 in time_series: continue
        X = np.array(range(8))
        fit = np.polyfit(X,time_series,2)
        for k in range(3):
            predictors["m"+str(j+1)+"_coef_"+str(k)].values[i] = fit[2-k]


#### Use coeficients of revenue

In [116]:
for j in range(3):
    predictors["rev_coef_"+str(j)] = np.zeros(predictors.shape[0])
    predictors["rev_max"] = np.zeros(predictors.shape[0])
    predictors["rev_min"] = np.zeros(predictors.shape[0])
    predictors["rev_mean"] = np.zeros(predictors.shape[0])
    predictors["rev_std"] = np.zeros(predictors.shape[0])
for i in range(predictors.shape[0]):
    
    if predictors["id"].values[i] in revenues["id"].values: 
        curr_rev  = revenues.ix[revenues["id"]== predictors["id"].values[i],:]
        if predictors["device"].values[i] in curr_rev["device"].values:
            curr_rev = curr_rev.ix[curr_rev["device"] == predictors["device"].values[i],:]
            time_series = np.array(curr_rev.ix[:,5:61])[0]
            if -1 in time_series: continue 
            X = np.array(range(56))
            fit = np.polyfit(X,time_series,2)
            for k in range(3):  predictors["rev_coef_"+str(k)].values[i] = fit[2-k]            

## To csv



In [117]:
predictors.to_csv("predictors.csv")

In [127]:
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)

In [128]:
predictors.head()

Unnamed: 0,id,name,category,device,week_1,week_2,week_3,week_4,week_5,week_6,week_7,week_8,daily_avg,coef_0,coef_1,coef_2,coef_3,maxStep,minStep,std,nb_missing,iphone,ipad,Productivity,Entertainment,Travel,Sports,Music,Shopping,Finance,Business,Navigation,Food and Drink,Utilities,Newsstand,Health and Fitness,News,Lifestyle,Medical,Weather,Games,Catalogs,Social Networking,Photo and Video,Reference,Books,Education,chinese,Unnamed: 49,japanese,other,english,korean,cumulative_downloads_2015-02,avg_review,var_review,star1,star2,star3,star4,star5,num_ratings,reviewPerDailyDownloads,num_versions,num_review,days_since_release,downloads_per_day_before,positive,negative,m1_max,m1_min,m1_mean,m1_std,m1_coef_0,m1_coef_1,m1_coef_2,m2_max,m2_min,m2_mean,m2_std,m2_coef_0,m2_coef_1,m2_coef_2,m3_max,m3_min,m3_mean,m3_std,m3_coef_0,m3_coef_1,m3_coef_2,m4_max,m4_min,m4_mean,m4_std,m4_coef_0,m4_coef_1,m4_coef_2,rev_coef_0,rev_max,rev_min,rev_mean,rev_std,rev_coef_1,rev_coef_2
0,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",Social Networking,iphone,6.239161,6.127804,6.101279,6.183265,6.078625,5.926926,5.792578,6.059791,433.678571,433.678571,-2.585304,0.035204,0.003661,308,-233,91.952066,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,617121,3.2,2.684058,0.189152,0.08059,0.145992,0.139242,0.445023,469040.0,0.760044,3.0,70.0,2424.0,254.482887,0,1,0.0,0.0,0.0,0.0,0.0324,-0.001056,0.000199,0.0,0.0,0.0,0.0,0.214854,-0.000385,-8.4e-05,0.0,0.0,0.0,0.0,15.083152,-0.472173,0.100554,0.0,0.0,0.0,0.0,88130.375,-2675.89881,549.422619,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,281922769,Mobile MIM,Medical,ipad,4.075113,4.084775,3.964886,4.198275,4.089571,4.027899,4.124828,3.991626,58.678571,58.678571,-0.028161,-0.00504,-0.000418,28,-34,8.860239,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,83281,1.0,0.0,0.256757,0.131532,0.172072,0.142342,0.296396,1109.0,0.013316,1.0,1.0,2424.0,34.34268,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,281922769,Mobile MIM,Medical,iphone,4.576183,4.532599,4.418841,4.507715,4.480578,4.464265,4.439284,4.552121,89.851852,86.642857,-0.149146,0.010635,-0.000346,31,-20,19.594486,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,99000,1.0,0.0,0.256757,0.131532,0.172072,0.142342,0.296396,1109.0,0.011202,1.0,1.0,2424.0,40.824742,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,ipad,5.61781,5.557379,5.520317,5.371302,5.493649,5.418637,5.434969,5.393628,239.660714,239.660714,-0.934279,0.019362,-0.000439,120,-73,31.992341,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,291496,2.666667,2.941176,0.281447,0.10309,0.140426,0.140326,0.33469,50353.0,0.172739,3.0,18.0,2424.0,120.204536,0,1,0.0,0.0,0.0,0.0,0.12898,-0.000505,-5.4e-05,0.0,0.0,0.0,0.0,0.6306,0.004964,-0.000612,0.0,0.0,0.0,0.0,20.453041,-0.239506,0.011242,0.0,0.0,0.0,0.0,350810.583333,-516.928571,-141.119048,415.328688,0.0,0.0,0.0,0.0,-0.732351,-0.035426
4,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,iphone,7.038282,7.073875,7.173192,7.016097,7.020318,7.022613,7.028075,7.001246,1150.660714,1150.660714,-1.639337,-0.069742,0.006644,394,-276,154.055797,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,969059,2.666667,2.941176,0.281447,0.10309,0.140426,0.140326,0.33469,50353.0,0.051961,3.0,18.0,2424.0,399.611959,0,1,0.0,0.0,0.0,0.0,0.12898,-0.000505,-5.4e-05,0.0,0.0,0.0,0.0,0.6306,0.004964,-0.000612,0.0,0.0,0.0,0.0,20.453041,-0.239506,0.011242,0.0,0.0,0.0,0.0,350810.583333,-516.928571,-141.119048,4691.119653,0.0,0.0,0.0,0.0,195.635228,-4.040557


## How good did the predictor perform   --> Start running from here

In [129]:
#imported libraries
import pandas as pd
import numpy as np
import scipy as sp
import math
import matplotlib.pyplot as plt
import datetime
%matplotlib inline  
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

#Creating of the input data
downloads = pd.read_csv('train_app_downloads.csv')
reviews = pd.read_csv('train_app_review.csv')
ratings = pd.read_csv('train_app_rating.csv')
usages = pd.read_csv('train_usage.csv')
revenues = pd.read_csv('train_revenue.csv')
output = pd.read_csv('train_final_downloads.csv')
dateRange = pd.date_range('2015-03-01', periods=56).format(formatter=lambda x: x.strftime('%Y-%m-%d'))

sentiment = pd.read_csv('sentiment.csv',header=-1).ix[:,0]
sentiment.values[sentiment.values==0.0] = 0.5
reviews["sentiment_score"] = sentiment.values 

#We map -1 to 0 in the downloads (there are no 0 in the initial data)
replacementValue=0
downloads = downloads.replace(-1,replacementValue)

#Minor corrections
ratings = ratings.rename(columns={'start1': 'star1'})
ratings = pd.merge(downloads.drop(dateRange,1), ratings.drop('Unnamed: 0', 1), how='left',
                   on=["id","name","category"]).replace("NaN",replacementValue)

In [130]:
#This is the metric we use to determine our performance
def metric(y_pred,y_test,percent=1):
    top = int(len(y_pred)/100.0*percent)
    return (len(set([i[0] for i in sorted(enumerate(y_pred), key=lambda x:x[1],reverse=True)][0:top])
       .intersection([i[0] for i in sorted(enumerate(y_test), key=lambda x:x[1],reverse=True)][0:top])
               ))/(percent/100.0)/len(y_pred)*100

## Predictor selection 2 methods. Cannot run both...

### Forward Predictor selection

In [131]:
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)
predictors = predictors.fillna(0)

In [132]:
#Use a model to do a forward recursive predictor selection
#mod=linear_model.Lasso(alpha=100,fit_intercept=False)
#mod=linear_model.LinearRegression(fit_intercept=False)
#mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100) 
mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100) 
rfe = RFE(estimator=mod, n_features_to_select=1, step=1)
rfe.fit(predictors.as_matrix()[:,4:], output.as_matrix()[:,5])
ranking = rfe.ranking_

In [133]:
for e in [(i[0],i[1]) for i in sorted(zip(predictors.columns[4:],ranking),key=lambda x: x[1])]:
    print e

('coef_0', 1)
('cumulative_downloads_2015-02', 2)
('week_4', 3)
('week_2', 4)
('week_3', 5)
('week_6', 6)
('week_8', 7)
('week_7', 8)
('week_1', 9)
('daily_avg', 10)
('week_5', 11)
('downloads_per_day_before', 12)
('days_since_release', 13)
('num_review', 14)
('num_ratings', 15)
('coef_2', 16)
('coef_1', 17)
('minStep', 18)
('avg_review', 19)
('coef_3', 20)
('maxStep', 21)
('std', 22)
('reviewPerDailyDownloads', 23)
('star4', 24)
('var_review', 25)
('star2', 26)
('star3', 27)
('num_versions', 28)
('nb_missing', 29)
('Food and Drink', 30)
('star1', 31)
('star5', 32)
('m2_coef_0', 33)
('m4_coef_0', 34)
('Books', 35)
('m1_coef_2', 36)
('m2_coef_1', 37)
('rev_coef_1', 38)
('m3_coef_0', 39)
('chinese', 40)
('m3_coef_1', 41)
('ipad', 42)
('rev_coef_2', 43)
('rev_coef_0', 44)
('korean', 45)
('m4_coef_1', 46)
('m2_coef_2', 47)
('english', 48)
('m3_coef_2', 49)
('negative', 50)
('m1_coef_1', 51)
('iphone', 52)
('positive', 53)
('m1_coef_0', 54)
('m4_coef_2', 55)
('Utilities', 56)
('other', 57)


In [134]:
#manually drop predictors 
#predictors_to_drop = []
nb_pred_to_keep = 60
predictors_to_drop = [i[0] for i in sorted(zip(predictors.columns[4:],ranking),key=lambda x: x[1])][nb_pred_to_keep:]
for col in predictors_to_drop:
    predictors = predictors.drop(col,1)
    

### Other predictor selection method using Lasso

In [None]:
# Other method
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)
predictors = predictors.fillna(0)
#var_select = linear_model.Lasso(alpha = 0.01).fit(predictor_train_top_10_precent.as_matrix()[:,4:],np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
var_select = linear_model.Lasso(alpha = 1).fit(predictor_train_top_10_precent.as_matrix()[:,4:],np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))

importance_list = [i for i in range(len(var_select.coef_)) if abs(var_select.coef_[i])>0]
important_predictors = [predictors.columns.values[i+4] for i in range(len(var_select.coef_)) if (i in importance_list)]
print important_predictors
for col in predictors.columns.values[4:]:
    if col not in important_predictors:
        predictors = predictors.drop(col,1)

In [None]:
predictors.columns.values

# Classification + Regression

In [136]:
print list(predictors.columns.values) 
np.random.seed(1)
K = 5

top_percent_classif = 10


kf = KFold(len(predictors), n_folds=K)
old_top = []
new_top = []
top_10 = []
new_top_select = [] 
new_top_noClassif = [] 
for train, test in kf:
    #base model
    old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
    old_y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])
    old_top.append(metric(old_y_pred,output.as_matrix()[test,5]))
    
    #model to determine the top 10%   (CLASSIFICATION)
    mod_class10= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
    
    y_pred =  mod_class10.predict(predictors.as_matrix()[test,4:])
    
    #estimate of the top 10% of the test set
    estimate_class10 = predictors.iloc[test].copy()
    estimate_class10["firstEstimate"] = y_pred
    #estimate_class10 = estimate_class10.sort_values(by= "firstEstimate",ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(estimate_class10))]
    estimate_class10 = estimate_class10.drop("firstEstimate",1)
    estimate_class10 = estimate_class10.sort_values(by= "daily_avg",ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(estimate_class10))]

    
    #top 10% of the trainning set
    output_train_top_10_precent = output.iloc[train].copy().sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(output.iloc[train]))].drop('Unnamed: 0',1)
    predictor_train_top_10_precent = output_train_top_10_precent.merge(predictors, how='left', on=["id","name","category","device"]).copy()
    predictor_train_top_10_precent = predictor_train_top_10_precent.drop('cumulative_downloads_2016-02',1)
    #predictor_train_top_10_precent = predictor_train_top_10_precent.drop('firstEstimate',1)
    
    #This is the actual top 1% of the test set
    output_test_top_1_precent = output.iloc[test].sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[1:int(0.01*len(output.iloc[test]))].copy()

    
    #second model -> Regression on the top obtainned by regression
    #mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix())
    mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
    #mod_top1=linear_model.Lasso(alpha=100,fit_intercept=False).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
    #mod_top1=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls').fit(predictor_train_top_10_precent.as_matrix()[:,4:], (output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
    
    y_pred_2 =  mod_top1.predict(estimate_class10.as_matrix()[:,4:])
    
    
    #Andrew's regression with additionnal lasso predictor selection
    #(1) feature selection : lasso or random forest
    #var_select = linear_model.Lasso(alpha = 0.01).fit(predictor_train_top_10_precent.as_matrix()[:,4:],np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
    var_select = RandomForestRegressor(max_features = "sqrt",n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
      
    #importance_list = [i for i in  range(len(var_select.coef_)) if abs(var_select.coef_[i])>0]    
    importance_list = list(reversed(np.argsort(var_select.feature_importances_)))[0:20]
    
    #new train and test set with the selected variables
    X_rf_train = predictor_train_top_10_precent.as_matrix()[:,4:][:,importance_list]
    y_rf_train = output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()
    X_rf_test = estimate_class10.as_matrix()[:,4:][:,importance_list]  
    
    #(2) regression: random forest or boosting 
    #mod_top1_select= RandomForestRegressor(max_features = "sqrt",n_estimators = 100).fit(X_rf_train, y_rf_train)
    params = {'n_estimators': 800, 'max_depth': 2, 'learning_rate': 0.015, 'loss': 'ls'}
    mod_top1_select= GradientBoostingRegressor(**params).fit(X_rf_train, y_rf_train)
    y_pred_3 = mod_top1_select.predict(X_rf_test)
    
    #No Classification model
    mod_noClassif= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
    y_pred_noClassif =  mod_noClassif.predict(predictors.as_matrix()[test,4:])

    
    
    
    
    estimate_top1 = estimate_class10.copy()
    estimate_top1_select = estimate_class10.copy()
    estimate_top1_noClassif = predictors.iloc[test].copy()
    estimate_top1["secondEstimate"] = y_pred_2
    estimate_top1_select["thirdEstimate"] = y_pred_3
    estimate_top1_noClassif["noClassifEstimate"] = y_pred_noClassif
    estimate_top1 = estimate_top1.sort_values(by= "secondEstimate",ascending = False).iloc[1:int(0.01*len(output.iloc[test]))]
    estimate_top1_select = estimate_top1_select.sort_values(by= "thirdEstimate",ascending = False).iloc[1:int(0.01*len(output.iloc[test]))]
    estimate_top1_noClassif = estimate_top1_noClassif.sort_values(by= "noClassifEstimate",ascending = False).iloc[1:int(0.01*len(output.iloc[test]))]

    estimation_error = len(estimate_top1.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent)
    new_top.append(estimation_error)
    new_top_select.append(len(estimate_top1_select.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))
    new_top_noClassif.append(len(estimate_top1_noClassif.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))
    top_10.append(len(estimate_class10.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))
       
print "Old model           : " + str(1.0*sum(old_top)/len(old_top))
print "Top10%              : " + str(1.0*sum(top_10)/len(top_10))
print "Top1% with classif1 : " + str(1.0*sum(new_top)/len(new_top))
print "Top1% with classif2 : " + str(1.0*sum(new_top_select)/len(new_top_select))
print "Top1% no classif1   : " + str(1.0*sum(new_top_noClassif)/len(new_top_noClassif))

['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'daily_avg', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'maxStep', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Entertainment', 'Food and Drink', 'Utilities', 'Social Networking', 'Photo and Video', 'Books', 'chinese', 'other', 'english', 'korean', 'cumulative_downloads_2015-02', 'avg_review', 'var_review', 'star1', 'star2', 'star3', 'star4', 'star5', 'num_ratings', 'reviewPerDailyDownloads', 'num_versions', 'num_review', 'days_since_release', 'downloads_per_day_before', 'positive', 'negative', 'm1_coef_0', 'm1_coef_1', 'm1_coef_2', 'm2_coef_0', 'm2_coef_1', 'm2_coef_2', 'm3_coef_0', 'm3_coef_1', 'm3_coef_2', 'm4_coef_0', 'm4_coef_1', 'm4_coef_2', 'rev_coef_0', 'rev_coef_1', 'rev_coef_2']
Old model           : 54.4232031342
Top10%              : 97.4603174603
Top1% with classif1 : 64.126984127
Top1% with classif2 : 64.4444444444
Top1% no classif1   : 63.4920634921


In [None]:
var_select.feature_importances_

In [None]:
len(predictor_train_top_10_precent)

# Old Testing model

In [None]:
old_top = []
new_top = []

test_frac = 0.31  #Fraction of test points
N = 20   #number of iterations
np.random.seed(1)
for i in range(1,N):
    r = np.random.randint(1,429496729)
    X_train, X_test, y_train, y_test = train_test_split(predictors.as_matrix()[:,4:12], output.as_matrix()[:,5], test_size=test_frac, random_state=r)
    X_test = X_test[0:10000]
    y_test = y_test[0:10000]
    old_mod=linear_model.LinearRegression(fit_intercept=False).fit(X_train,y_train)
    old_y_pred =  old_mod.predict(X_test)
    old_top.append(metric(old_y_pred,y_test))
    
np.random.seed(1)
for i in range(1,N):
    r = np.random.randint(1,429496729)
    X_train, X_test, y_train, y_test = train_test_split(predictors.as_matrix()[:,4:], output.as_matrix()[:,5], test_size=test_frac, random_state=r)
    X_test = X_test[0:10000]
    y_test = y_test[0:10000]
    #mod=linear_model.LinearRegression(fit_intercept=False).fit(X_train,y_train)
    #mod=linear_model.Lasso(alpha=100,fit_intercept=False).fit(X_train,y_train)
    #mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100).fit(X_train,y_train)
    mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(X_train,y_train)
    y_pred =  mod.predict(X_test)
    new_top.append(metric(y_pred,y_test))
if (N<300):
    fig = plt.figure()
    plt.plot(range(1,N),old_top,label="old")
    plt.plot(range(1,N),new_top,label="performance")
    plt.legend(loc='best')
    plt.show()

print 1.0*sum(old_top)/len(old_top)
print list(predictors.columns.values)
print 1.0*sum(new_top)/len(new_top)

54.8947368421  
55.0526315789  #with #missing and Lasso <br/>
54.7894736842  #including raw ratings Lasso :( <br/>
55.0526315789  #with #missing weightedSumRatings and Lasso <br/>
55.1052631579  #adding the categories <br/>
55.3684210526  #adding average sentiment score and positive/negative label <br/>
55.5263157895  #adding coeficients of usages


With 10000 only:
54.8947368421 all

56.5263157895 on Lasso all but 'rev_coef_i'

Lasso All alpha = 100
['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'daily_avg', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'maxStep', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Productivity', 'Entertainment', 'Travel', 'Sports', 'Music', 'Shopping', 'Finance', 'Business', 'Navigation', 'Food and Drink', 'Utilities', 'Newsstand', 'Health and Fitness', 'News', 'Lifestyle', 'Medical', 'Weather', 'Games', 'Catalogs', 'Social Networking', 'Photo and Video', 'Reference', 'Books', 'Education', 'avg_review', 'var_review', 'star1', 'star2', 'star3', 'star4', 'star5', 'positive', 'negative', 'm1_coef_0', 'm1_coef_1', 'm1_coef_2', 'm2_coef_0', 'm2_coef_1', 'm2_coef_2', 'm3_coef_0', 'm3_coef_1', 'm3_coef_2', 'm4_coef_0', 'm4_coef_1', 'm4_coef_2']
57.1052631579

Predictor selection, pick the top 50 features selected in the forward recursive selection with Lasso and then do random forest

54.6842105263
['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'daily_avg', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'maxStep', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Productivity', 'Entertainment', 'Sports', 'Music', 'Shopping', 'Finance', 'Business', 'Navigation', 'Food and Drink', 'Utilities', 'News', 'Lifestyle', 'Medical', 'Weather', 'Games', 'Catalogs', 'Social Networking', 'Photo and Video', 'Reference', 'Education', 'avg_review', 'var_review', 'star2', 'star3', 'star4', 'positive', 'negative', 'm2_coef_0', 'm3_coef_0', 'm3_coef_1', 'm4_coef_2']
57.3157894737

In [None]:
#Estimation of alpha for the Lasso regression "cros validation" like approach
for a in np.arange(90,110,5):

    old_top = []
    new_top = []

    test_frac = 0.31  #Fraction of test points
    N = 20   #number of iterations


    np.random.seed(1)
    for i in range(1,N):
        r = np.random.randint(1,429496729)
        X_train, X_test, y_train, y_test = train_test_split(predictors.as_matrix()[:,4:], output.as_matrix()[:,5], test_size=test_frac, random_state=r)
        X_test = X_test[0:10000]
        y_test = y_test[0:10000]
        mod=linear_model.Lasso(alpha=a,fit_intercept=False).fit(X_train,y_train) 
        y_pred =  mod.predict(X_test)
        new_top.append(metric(y_pred,y_test))


    print a
    print 1.0*sum(new_top)/len(new_top)
    print " "

## Cross Validation k-folds

In [None]:
K = 10

kf = KFold(len(predictors), n_folds=K)
old_top = []
new_top = []
for train, test in kf:
    #base model
    old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
    old_y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])
    old_top.append(metric(old_y_pred,output.as_matrix()[test,5]))
    #model
    #mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5]) 
    #mod=linear_model.Lasso(alpha=100,fit_intercept=False).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5]) 
    #mod = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls').fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5]) 
    #mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
    mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
    y_pred =  mod.predict(predictors.as_matrix()[test,4:])
    new_top.append(metric(y_pred,output.as_matrix()[test,5]))
print 1.0*sum(old_top)/len(old_top)
print list(predictors.columns.values)
print 1.0*sum(new_top)/len(new_top)

K=10 
Random Forest sqrt  :mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
PPredictors obtainned with top 50 on Lasso
['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Productivity', 'Entertainment', 'Travel', 'Sports', 'Music', 'Shopping', 'Finance', 'Business', 'Navigation', 'Utilities', 'News', 'Lifestyle', 'Medical', 'Weather', 'Games', 'Catalogs', 'Social Networking', 'Photo and Video', 'Reference', 'Books', 'Education', 'avg_review', 'var_review', 'star2', 'star3', 'star4', 'positive', 'negative', 'm2_coef_0', 'm3_coef_0', 'm3_coef_1', 'rev_coef_0', 'rev_coef_1']
58.1337784952

# Lasso all
53.4375
['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'daily_avg', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'maxStep', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Productivity', 'Entertainment', 'Travel', 'Sports', 'Music', 'Shopping', 'Finance', 'Business', 'Navigation', 'Food and Drink', 'Utilities', 'Newsstand', 'Health and Fitness', 'News', 'Lifestyle', 'Medical', 'Weather', 'Games', 'Catalogs', 'Social Networking', 'Photo and Video', 'Reference', 'Books', 'Education', 'avg_review', 'var_review', 'star1', 'star2', 'star3', 'star4', 'star5', 'positive', 'negative', 'm1_coef_0', 'm1_coef_1', 'm1_coef_2', 'm2_coef_0', 'm2_coef_1', 'm2_coef_2', 'm3_coef_0', 'm3_coef_1', 'm3_coef_2', 'm4_coef_0', 'm4_coef_1', 'm4_coef_2']
54.0625

## Attempt at boosting

In [None]:
#Boosting

from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

new_top = []

test_frac = 0.31  #Fraction of test points
N = 20   #number of iterations



np.random.seed(1)
for i in range(1,N):
    r = np.random.randint(1,429496729)
    X_train, X_test, y_train, y_test = train_test_split(predictors.as_matrix()[:,4:], output.as_matrix()[:,5], test_size=test_frac, random_state=r)
    X_test = X_test[0:10000]
    y_test = y_test[0:10000]
    #mod=linear_model.LinearRegression(fit_intercept=False).fit(X_train,y_train)
    #mod=linear_model.Lasso(alpha=a,fit_intercept=False).fit(X_train,y_train) 
    mod = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls').fit(X_train,y_train).fit(X_train,y_train) 
    mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100) 
    y_pred =  mod.predict(X_test)
    new_top.append(metric(y_pred,y_test))

print 1.0*sum(new_top)/len(new_top)
print " "


# CV on the size of the Classification top

In [None]:
print list(predictors.columns.values) 
np.random.seed(1)
K = 5

cv_top_10 = []
cv_top_1  = []

for top in range(1,31):

    top_percent_classif = top

    kf = KFold(len(predictors), n_folds=K)
    old_top = []
    new_top = []
    top_10 = []
    new_top_select = [] 
    for train, test in kf:
        #base model
        old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
        old_y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])
        old_top.append(metric(old_y_pred,output.as_matrix()[test,5]))

        #model to determine the top 10%   (CLASSIFICATION)
        mod_class10= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])

        y_pred =  mod_class10.predict(predictors.as_matrix()[test,4:])

        #estimate of the top 10% of the test set
        estimate_class10 = predictors.iloc[test].copy()
        estimate_class10["firstEstimate"] = y_pred
        #estimate_class10 = estimate_class10.sort_values(by= "firstEstimate",ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(estimate_class10))]
        estimate_class10 = estimate_class10.drop("firstEstimate",1)
        estimate_class10 = estimate_class10.sort_values(by= "daily_avg",ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(estimate_class10))]


        #top 10% of the trainning set
        output_train_top_10_precent = output.iloc[train].copy().sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(output.iloc[train]))].drop('Unnamed: 0',1)
        predictor_train_top_10_precent = output_train_top_10_precent.merge(predictors, how='left', on=["id","name","category","device"]).copy()
        predictor_train_top_10_precent = predictor_train_top_10_precent.drop('cumulative_downloads_2016-02',1)
        #predictor_train_top_10_precent = predictor_train_top_10_precent.drop('firstEstimate',1)

        #This is the actual top 1% of the test set
        output_test_top_1_precent = output.iloc[test].sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[1:int(0.01*len(output.iloc[test]))].copy()


        #second model -> Regression on the top obtainned by regression
        #mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix())
        mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
        #mod_top1=linear_model.Lasso(alpha=100,fit_intercept=False).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
        #mod_top1=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls').fit(predictor_train_top_10_precent.as_matrix()[:,4:], (output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))

        y_pred_2 =  mod_top1.predict(estimate_class10.as_matrix()[:,4:])

        estimate_top1 = estimate_class10.copy()
        estimate_top1["secondEstimate"] = y_pred_2

        estimate_top1 = estimate_top1.sort_values(by= "secondEstimate",ascending = False).iloc[1:int(0.01*len(output.iloc[test]))]

        estimation_error = len(estimate_top1.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent)
        new_top.append(estimation_error)
        top_10.append(len(estimate_class10.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))


    cv_top_10.append(1.0*sum(top_10)/len(top_10))
    cv_top_1.append(1.0*sum(new_top)/len(new_top))

plt.plot(range(1,31),cv_top_10,label = 'top 10 estimate')
plt.plot(range(1,31),cv_top_1,label = 'top 1 estimate')
plt.legend(loc = 'best')




In [None]:
plt.plot(range(1,31),cv_top_10,label = 'top 10 estimate')
plt.plot(range(1,31),cv_top_1,label = 'top 1 estimate')
plt.legend(loc = 'best')


In [None]:
output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False).head(1)


In [None]:
predictors.columns


# Making Classification on top 10%

In [None]:
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)
predictors = predictors.fillna(0)

thr = predictors.sort_values(by= 'week_8',ascending = False).iloc[int(0.1*len(predictors))]["week_8"]
predictors  = predictors[predictors.week_8 >= thr]


len(predictors)

In [None]:
output_classification_10 = output
output_classification_50 = output
output_classification_100 = output
#Is top ?
def generator_istop(inp,threshold):
    if (inp[5] >= threshold):
        return 'True'
    else:
        return 'False'

threshold = output.sort_values('cumulative_downloads_2016-02',ascending = False).iloc[len(output)/10]['cumulative_downloads_2016-02']
output_classification_10["is_top"] = output.apply(generator_istop,axis = 1,args=(threshold,))

threshold = output.sort_values('cumulative_downloads_2016-02',ascending = False).iloc[len(output)/50]['cumulative_downloads_2016-02']
output_classification_50["is_top"] = output.apply(generator_istop,axis = 1,args=(threshold,))

threshold = output.sort_values('cumulative_downloads_2016-02',ascending = False).iloc[len(output)/100]['cumulative_downloads_2016-02']
output_classification_100["is_top"] = output.apply(generator_istop,axis = 1,args=(threshold,))

In [None]:
output_classification_10.head()

In [None]:
from sklearn.svm import SVC
K=10
kf = KFold(32339, n_folds=K)
old_top = []
new_top = []
for train, test in kf:
    #model
    mod = SVC(C=10.0, kernel='poly', degree=3, gamma='auto', coef0=2.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=10, decision_function_shape=None, random_state=None).fit(predictors.as_matrix()[train,4:],output_classification_10.as_matrix()[train,6])
    #mod = linear_model.LogisticRegression().fit(predictors.as_matrix()[train,4:], output_classification_10.as_matrix()[train,6])
    y_pred =  mod.predict(predictors.as_matrix()[test,4:])
    new_top.append(metric_classification(y_pred,output_classification_10.as_matrix()[test,6]))
print list(predictors.columns.values)
print 1.0*sum(new_top)/len(new_top)


In [None]:
###This metric is not very good, put True everywhere and get 100%
def metric_classification(y_pred,y_test):
    nb_top = 0
    for i in range(len(y_test)):
        if (y_pred[i]=='True' and y_test[i]=="False"):
            nb_top+=1
    return 100.0*nb_top/len(y_pred)


In [None]:
list(y_pred).count("False")*1.0/len(y_pred)

In [None]:
print list(y_pred).count('True')
print list(output_classification_10.as_matrix()[test,6]).count('True')
print len(y_pred)

Question, what if instead of using the 'cumulative_downloads_2016-02' we used exp('cumulative_downloads_2016-02') to try to shrunk the lowest points?


In [None]:
output_scaled = output
#Is top ?
def generator_outputScaler(inp):
    #return math.exp(inp[5]*1.0/12329752*100)   #28%
    #return 1.0/(20000000-inp[5])   #0.618429189858
    #return inp[5]**2       #53.8045828797
    #return (inp[5]*1.0/20000000)**2*inp[5]     #46.3834326015
    #return (inp[5]*1.0/1500000)**0.5   #  56.5875142341
    #return 100.0/(1+math.exp(-(inp[5]-1500000)*1.0/100000))    #54.4230120696
    #return 200.0/(1+math.exp(-(inp[5]-1500000)*1.0/1000000)) - 100    #55.9691806875
    #return math.log(inp[5])      #58.7522076851
    return inp[5]               #58.4429930902
    #return 200.0/(1+math.exp(-(inp[5]*1.0/1500000-1)*10)) - 100       #55.3506558544
    #return math.exp(1.0/(1+math.exp(-(inp[5]*1.0/1500000-1)*10)))     #


output_scaled["scaled_downloads"] = output.apply(generator_outputScaler,axis = 1)

#output_scaled["scaled_downloads"] =(output['cumulative_downloads_2016-02']-output['cumulative_downloads_2016-02'].mean())/output['cumulative_downloads_2016-02'].std()

In [None]:
output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False).head(320)

In [None]:
plt.plot(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"]))
plt.axis([0, 300,min(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"])),max(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"]))])

In [None]:
K = 10

kf = KFold(32339, n_folds=K)
old_top = []
new_top = []
for train, test in kf:
    #base model
    old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
    old_y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])
    old_top.append(metric(old_y_pred,output.as_matrix()[test,5]))
    #model
    #mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    #mod=linear_model.Lasso(alpha=100,fit_intercept=False).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    #mod = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls').fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6]) 
    #mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    y_pred =  mod.predict(predictors.as_matrix()[test,4:])
    new_top.append(metric(y_pred,output_scaled.as_matrix()[test,6]))
print 1.0*sum(old_top)/len(old_top)
print list(predictors.columns.values)
print 1.0*sum(new_top)/len(new_top)

In [None]:
plt.plot(list(output.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["cumulative_downloads_2016-02"]))
plt.axis([0, 3200, 0, 13000000])

In [None]:
plt.plot(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"]))


In [90]:
reviews

Unnamed: 0,id,name,name.1,rating,date,title,version,text,reviewer,sentiment_score
0,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-04,Gets the job done but needs improvement,6.0.6,Gets the job done but needs improvement. Still...,bleekoutlook,0.208647
1,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-10,Not a fan...,6.0.6,Constantly shuts down on me or will double my ...,nyr8er,0.027757
2,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-11,boo,6.0.6,"Shuts down a lot , it needs a new theme and we...",Makiaric,0.116763
3,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-13,Keeps crashing.,6.0.6,"Whenever I try starting the app, it allows me ...",CrudexVendetta,0.074171
4,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-13,Was good but not anymore,6.0.6,"Won't even let me sign in, and when I enter my...",Jdeguilmo08,0.026922
5,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-15,Mic won't work,6.0.6,"Since updating my iPhone 6 OS, can't use the ...",XxdldxX,0.003427
6,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-18,Bad,6.0.6,Confusing and awful. Where s the old one!?,donnyf68,0.001292
7,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-19,change design,6.0.6,clearly no one wants aim just to chat because ...,jaksvdnsn,0.724061
8,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-19,The new version has lost its functionality,6.0.6,I have used aim for four years now. The new v...,Sci Mars,0.000186
9,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",United States,1,2015-03-24,Can't sign up,6.0.6,Never sends confirmation email so cannot use. ...,Grizzlymomof3,6.3e-05


In [92]:
predictors

Unnamed: 0,id,name,category,device,week_1,week_2,week_3,week_4,week_5,week_6,week_7,week_8,daily_avg,coef_0,coef_1,coef_2,coef_3,maxStep,minStep,std,nb_missing,iphone,ipad,Shopping,Finance,Business,Social Networking,Medical,Utilities,News,Education,Navigation,Travel,other,english,cumulative_downloads_2015-02,avg_review,var_review,star1,star2,star3,star4,star5,num_ratings,reviewPerDailyDownloads,num_versions,num_review,days_since_release,downloads_per_day_before,positive,negative,avg_sentiment_score
0,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",Social Networking,iphone,6.239161,6.127804,6.101279,6.183265,6.078625,5.926926,5.792578,6.059791,433.678571,433.678571,-2.585304,0.035204,0.003661,308,-233,91.952066,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,617121,1.5,0.603448,0.189152,0.08059,0.145992,0.139242,0.445023,469040.0,0.760044,3.0,30.0,2424,254.482887,0,1,0.12951
1,281922769,Mobile MIM,Medical,ipad,4.075113,4.084775,3.964886,4.198275,4.089571,4.027899,4.124828,3.991626,58.678571,58.678571,-0.028161,-0.00504,-0.000418,28,-34,8.860239,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,83281,0.0,0.0,0.256757,0.131532,0.172072,0.142342,0.296396,1109.0,0.013316,0.0,0.0,2424,34.34268,0,0,
2,281922769,Mobile MIM,Medical,iphone,4.576183,4.532599,4.418841,4.507715,4.480578,4.464265,4.439284,4.552121,89.851852,86.642857,-0.149146,0.010635,-0.000346,31,-20,19.594486,2,1,0,0,0,0,0,1,0,0,0,0,0,1,0,99000,0.0,0.0,0.256757,0.131532,0.172072,0.142342,0.296396,1109.0,0.011202,0.0,0.0,2424,40.824742,0,0,
3,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,ipad,5.61781,5.557379,5.520317,5.371302,5.493649,5.418637,5.434969,5.393628,239.660714,239.660714,-0.934279,0.019362,-0.000439,120,-73,31.992341,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,291496,0.0,0.0,0.281447,0.10309,0.140426,0.140326,0.33469,50353.0,0.172739,0.0,0.0,2424,120.204536,0,0,
4,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,iphone,7.038282,7.073875,7.173192,7.016097,7.020318,7.022613,7.028075,7.001246,1150.660714,1150.660714,-1.639337,-0.069742,0.006644,394,-276,154.055797,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,969059,0.0,0.0,0.281447,0.10309,0.140426,0.140326,0.33469,50353.0,0.051961,0.0,0.0,2424,399.611959,0,0,
5,284147312,iTrans DC Metro,Navigation,iphone,6.340107,6.486814,6.627474,6.550468,6.656359,6.727432,6.583409,6.43935,705.017857,705.017857,1.771258,-0.272778,-0.007031,302,-286,125.918978,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,688766,0.0,0.0,0.123731,0.077411,0.105964,0.196701,0.495558,1575.0,0.002287,0.0,0.0,2424,284.027216,0,0,
6,284574017,Units - Free Unit Converter,Utilities,ipad,4.513994,4.163337,4.129436,4.223701,4.169982,4.313671,4.30984,5.192163,84.964286,84.964286,1.199111,0.112912,0.003346,115,-124,45.913336,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,174578,0.0,0.0,0.209694,0.121846,0.228126,0.186376,0.253938,48503.0,0.277828,0.0,0.0,2391,72.984114,0,0,
7,284574017,Units - Free Unit Converter,Utilities,iphone,6.052089,5.997695,6.167516,6.042633,0.0,0.0,0.0,6.57902,572.333333,122.642857,5.276897,0.493237,0.037758,262,-162,247.844596,44,1,0,0,0,0,0,0,1,0,0,0,0,0,1,298172,0.0,0.0,0.209694,0.121846,0.228126,0.186376,0.253938,48503.0,0.162667,0.0,0.0,2391,124.653846,0,0,
8,284735786,Compass Free,Utilities,ipad,3.579543,3.695997,3.797093,3.925268,3.909162,3.703064,3.839452,4.032976,45.481481,43.857143,0.257553,-0.000722,0.000748,67,-63,19.078249,2,0,1,0,0,0,0,0,1,0,0,0,0,0,1,109644,0.0,0.0,0.693822,0.121501,0.06966,0.027874,0.087138,163809.0,1.493994,0.0,0.0,2407,45.533223,0,0,
9,284776127,"Shopper - Grocery List, Shopping List and Recipes",Shopping,ipad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0,0,0.0,56,0,1,1,0,0,0,0,0,0,0,0,0,0,1,578,0.0,0.0,0.278923,0.158691,0.184091,0.159646,0.218568,12558.0,21.689119,0.0,0.0,2424,0.238351,0,0,


In [None]:
predictors.sort_values(by= 'week_8',ascending = False)[0:3233].head(3)

In [None]:
output.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)[0:323].head(3)

In [None]:
len(predictors.sort_values(by= 'week_8',ascending = False)[0:3233].merge(output.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)[0:323], how='inner', on=["id","name","category","device"]))

In [70]:
predictors = predictors[:30]
downloads = downloads[:30]
reviews = reviews[:30]
ratings = ratings[:30]
usages = usages[:30]
revenues = revenues[:30]
output =  output[:30]
dateRange = dateRange[:30]

In [41]:
reviews = reviews[:3000]

In [None]:



estimate_top1 = estimate_class10.copy()
estimate_top1["secondEstimate"] = np.exp(y_pred_2).astype(int)
estimate_top1 = estimate_top1.merge(output, how='left', on=["id","name","category","device"]).copy()
estimate_top1 = estimate_top1.drop("Unnamed: 0",1)
estimate_top1.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)
