### Run this part of the notebook to generate the predictors


In [None]:
#imported libraries
import pandas as pd
import numpy as np
import scipy as sp
import math
import matplotlib.pyplot as plt
import langdetect
import datetime
%matplotlib inline  
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import Imputer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Load the raw input tables.
downloads = pd.read_csv('train_app_downloads.csv')
reviews = pd.read_csv('train_app_review.csv')
ratings = pd.read_csv('train_app_rating.csv')
usages = pd.read_csv('train_usage.csv')
revenues = pd.read_csv('train_revenue.csv')
output = pd.read_csv('train_final_downloads.csv')
prev_downloads = pd.read_csv('train_cumulative_downloads_2015-02.csv').drop('Unnamed: 0', axis=1)
# Labels of the 56 daily columns, starting on 2015-03-01.
dateRange = pd.date_range('2015-03-01', periods=56).format(formatter=lambda x: x.strftime('%Y-%m-%d'))

# sentiment.csv has no header row; header=None is the documented way to say so
# (negative header values are rejected by newer pandas).
sentiment = pd.read_csv('sentiment.csv', header=None).iloc[:, 0]
# A score of exactly 0.0 is mapped to the neutral value 0.5. `where` returns a
# new Series instead of writing through `.values`, which may be a copy.
sentiment = sentiment.where(sentiment != 0.0, 0.5)
reviews["sentiment_score"] = sentiment.values

# Value the feature generators treat as "missing". After the imputation below
# the -1 markers are replaced by column medians, not by this value.
replacementValue = 0
imputer = Imputer(missing_values=-1, strategy='median', axis=0)


def _impute_missing(frame, n_meta):
    """Return `frame` with -1 in the value columns replaced by column medians.

    The first `n_meta` columns are kept untouched; every column after them is
    passed through the module-level `imputer`. Column labels are preserved.
    """
    imputed_values = pd.DataFrame(imputer.fit_transform(frame.iloc[:, n_meta:]),
                                  index=frame.index)
    imputed = pd.concat([frame.iloc[:, :n_meta], imputed_values], axis=1)
    imputed.columns = frame.columns
    return imputed


downloads = _impute_missing(downloads, 5)
usages = _impute_missing(usages, 6)
revenues = _impute_missing(revenues, 5)

# Minor corrections: fix the misspelled 'start1' column and attach the ratings
# to every app/device row of `downloads`.
ratings = ratings.rename(columns={'start1': 'star1'})
# fillna (not replace("NaN", ...)) is what actually fills the rows that have
# no rating: replace() would only match the literal string "NaN".
ratings = pd.merge(downloads.drop(dateRange, axis=1), ratings.drop('Unnamed: 0', axis=1), how='left',
                   on=["id","name","category"]).fillna(replacementValue)

#### Initialize the predictors matrix

In [None]:
# Start the feature table from the four identifying columns of each row.
predictors = downloads[["id", "name", "category", "device"]].copy()
predictors.head()

#### Use the train_app_downloads.csv file

In [None]:
#Generate the log of the weekly average
def generator_weekAvg(inp,w):
    """Return the log of the mean daily value over week `w` (0-based).

    The week is the 7 positions 5+7*w .. 11+7*w of the row. Entries equal to
    `replacementValue` are excluded from the divisor; a week made only of such
    entries yields 0.
    """
    week = inp[5+w*7:12+w*7]
    observed = np.count_nonzero(week - replacementValue*np.ones(len(week)))
    if observed == 0:
        return 0
    return math.log(1.0*sum(week)/observed)

# One feature column per week: week_1 .. week_8.
for w in range(8):
    predictors["week_"+str(w+1)] = downloads.apply(generator_weekAvg,axis=1,args=(w,))

In [None]:
#Generate the daily average
def generator_dailyAvg(inp):
    """Return the mean of the row's values from position 5 onwards.

    Entries equal to `replacementValue` are left out of the divisor; a row
    with no other entries yields 0.
    """
    days = inp[5:]
    observed = np.count_nonzero(days - replacementValue*np.ones(len(days)))
    if observed == 0:
        return 0
    return 1.0*sum(days)/observed

predictors["daily_avg"] = downloads.apply(generator_dailyAvg,axis=1)
# Plain total over every column after the first five.
predictors["download_sum"] = downloads.ix[:,5:].sum(axis=1)
#This one is bad

In [None]:
#Generate the polynomial coefficients
def generator_coef(inp, coef):
    """Return the leading coefficient of a polynomial fit to the cumulative series.

    inp  -- one row; positions 5 onwards hold the daily values.
    coef -- degree of the polynomial passed to np.polyfit.

    The x axis is 0..n-1 where n is the number of daily values actually
    present, so the function is no longer tied to exactly 56 columns. Values
    are coerced to float so that rows of mixed (object) dtype can be fitted.
    """
    cumulative = np.cumsum(np.asarray(inp[5:], dtype=float))
    return np.polyfit(np.arange(len(cumulative)), cumulative, coef)[0]
# TODO: redo the fit while ignoring the missing (-1 / 0) days, e.g. with polyfit
# weights w = np.not_equal(inp[5:], np.ones(len(inp[5:]))*replacementValue)

# One feature per polynomial degree 0..3 (coef_0 .. coef_3), each holding the
# value returned by generator_coef for that degree.
for c in range(4):
    predictors["coef_"+str(c)] = downloads.apply(generator_coef,axis=1,args=(c,))

In [None]:
#Generate the step max and min (we ignore the values of 0)
def generator_maxStep(inp,maximum):
    """Return the largest (maximum=True) or smallest (maximum=False) day-to-day step.

    A step is the difference between two consecutive daily values (positions
    5..60 of the row); pairs in which either value equals `replacementValue`
    are skipped. The search starts from 0, so the result is never below 0 for
    the maximum and never above 0 for the minimum.
    """
    if np.count_nonzero(inp[5:] - replacementValue*np.ones(len(inp[5:]))) == 0:
        return 0
    extreme = 0
    for day in range(1,56):
        current, previous = inp[5+day], inp[4+day]
        if current == replacementValue or previous == replacementValue:
            continue
        step = current - previous
        if maximum:
            if extreme < step:
                extreme = step
        elif extreme > step:
            extreme = step
    return extreme

predictors["maxStep"] = downloads.apply(generator_maxStep,axis=1,args=(True,))
predictors["minStep"] = downloads.apply(generator_maxStep,axis=1,args=(False,))

In [None]:
#Standard Deviation
def generator_std(inp):
    """Return np.std of the row's values from position 5 onwards.

    A row whose values all equal `replacementValue` yields 0.
    """
    days = inp[5:]
    has_data = np.count_nonzero(days - replacementValue*np.ones(len(days))) != 0
    if not has_data:
        return 0
    return np.std(days)

predictors["std"] = downloads.apply(generator_std,axis = 1)

In [None]:
#Number of missing Values
def generator_missing(inp):
    """Count the entries from position 5 onwards that equal `replacementValue`."""
    return sum(1 for value in inp[5:] if value == replacementValue)

predictors["nb_missing"] = downloads.apply(generator_missing,axis = 1)

In [None]:
#device
def generator_iphone(inp):
    """Return 1 when position 4 of the row is "iphone", otherwise 0."""
    return 1 if inp[4] == "iphone" else 0

def generator_ipad(inp):
    """Return 1 when position 4 of the row is "ipad", otherwise 0."""
    return 1 if inp[4] == "ipad" else 0
    
# 0/1 indicator columns for the device of each row.
predictors["iphone"] = downloads.apply(generator_iphone,axis = 1)
predictors["ipad"] = downloads.apply(generator_ipad,axis = 1)

In [None]:
#Categories
def generator_categories(inp,cat):
    """Return 1 when position 3 of the row equals `cat`, otherwise 0."""
    return 1 if inp[3] == cat else 0
    

# One 0/1 indicator column per distinct category, named after the category.
# Column order follows the iteration order of the set.
for cat in list(set(downloads["category"])):
    predictors[cat] = downloads.apply(generator_categories,axis = 1,args=(cat,))

In [None]:
# langdetect language codes that get their own label; every other code is 'other'.
# NOTE(review): only 'zh-cn' is mapped to 'chinese'; confirm whether other
# Chinese codes returned by langdetect should be included as well.
_LANGUAGE_LABELS = {
    'ja': 'japanese',
    'zh-cn': 'chinese',
    'ko': 'korean',
    'en': 'english',
}


def detect_language(x):
    """Return 'japanese', 'chinese', 'korean', 'english' or 'other' for a text.

    Byte strings are decoded as UTF-8 (undecodable bytes are ignored); text
    that is already decoded is passed to langdetect unchanged instead of
    failing on `.decode`. Detection is best effort: any error (non-text
    input, text without detectable features, ...) yields 'other'.
    """
    try:
        text = x.decode('utf8', 'ignore') if isinstance(x, bytes) else x
        detected = langdetect.detect(text)
    except Exception:
        # Deliberately broad, but unlike a bare `except:` this does not
        # swallow KeyboardInterrupt / SystemExit.
        return 'other'
    return _LANGUAGE_LABELS.get(detected, 'other')

def set_lang_categories(x, cat):
    """Return 1 when `x` equals `cat`, otherwise 0."""
    return 1 if x == cat else 0
    
# Detect the language of every app name, then add one 0/1 indicator column
# per label that actually occurs.
lang_series = predictors['name'].apply(detect_language)
for cat in list(set(lang_series)):
    predictors[cat] = lang_series.apply(set_lang_categories, args=(cat,))
    

#### Use the train_cumulative_downloads_2015-02.csv file

In [None]:
#prev_downloads = pd.read_csv('train_cumulative_downloads_2015-02.csv').drop('Unnamed: 0', 1)
# Left join: rows without a match in prev_downloads keep NaN in the new columns.
predictors = pd.merge(predictors, prev_downloads, how='left',
                  on=["id","device"])

In [None]:
# Naive projection: previous cumulative downloads plus 7 times the downloads
# summed over the observed window.
# NOTE(review): the factor 7 presumably extrapolates the 56 observed days to
# roughly one year - confirm.
predictors['dl_projection'] = predictors['cumulative_downloads_2015-02'] + 7 * predictors['download_sum']

#### Use the train_app_rating.csv file and reviews

In [None]:
# Mean review rating per app. Only the 'rating' column is aggregated, so text
# columns of `reviews` are never fed to mean().
avg_reviews = reviews.groupby('id')[['rating']].mean()
# Apps without any review get 0. fillna is required here: replace("NaN", 0)
# only matches the literal string "NaN" and leaves real missing values in place.
predictors['avg_review'] = predictors.join(avg_reviews['rating'],on='id')['rating'].fillna(0)


In [None]:
# Variance of the review ratings per app (NaN for apps with a single review).
# Only the 'rating' column is aggregated.
avg_reviews = reviews.groupby('id')[['rating']].var()
# Apps with no review, or with a single one, get 0. fillna is required here:
# replace("NaN", 0) only matches the literal string "NaN".
predictors['var_review'] = predictors.join(avg_reviews['rating'],on='id')['rating'].fillna(0)


In [None]:
# Total number of star ratings per row.
ratings['num_ratings'] = ratings.ix[:,['star1','star2','star3','star4','star5']].sum(axis=1)
#Scaling the ratings
# Each star count becomes a share of the total; the +1 in the denominator
# avoids a division by zero for rows without ratings.
for i in range(1,6):
    ratings['star'+str(i)]=ratings['star'+str(i)].divide(ratings['num_ratings']+1)

In [None]:
#raw ratings
# NOTE(review): replace("NaN", ...) matches the string "NaN", not real missing
# values, so NaN left by the merges presumably survives this line (the CSV is
# re-read with fillna(0) further down). Confirm before switching to fillna:
# filling here would change features derived from these columns afterwards.
predictors = pd.merge(predictors, ratings.drop('Unnamed: 0', 1), how='left',
                   on=["id","name","category","device"]).replace("NaN",replacementValue)


In [None]:
#Reviews per daily downloads
#predictors['reviewPerDailyDownloads'] = predictors['num_ratings'].divide(predictors['daily_avg']+1)
# Despite its name, the feature divides by the cumulative downloads up to
# 2015-02 (+1 to avoid dividing by zero), not by the daily average.
predictors['ratings_per_daily_downloads'] = predictors['num_ratings'].divide(predictors['cumulative_downloads_2015-02']+1)

#Drop either this or num_ratings

In [None]:
# Number of distinct app versions seen in the reviews of each app.
num_versions = reviews.groupby('id').version.nunique()
num_versions.name = 'num_versions'
predictors = predictors.join(num_versions, how='left', on='id')
# Apps without reviews get 0. fillna is required here: replace('NaN', 0) only
# matches the literal string 'NaN' and leaves real missing values in place.
predictors['num_versions'] = predictors['num_versions'].fillna(0)

In [None]:
# Number of reviews per app.
num_review = reviews.groupby('id').rating.count()
num_review.name = 'num_review'
predictors = predictors.join(num_review, how='left', on='id')
# Reviews relative to the cumulative downloads up to 2015-02 (+1 avoids a
# division by zero). Apps without reviews count as 0 reviews; fillna is needed
# for that because replace('NaN', 0) only matches the literal string 'NaN'.
predictors['num_review_per_daily_downloads'] = predictors['num_review'].fillna(0).divide(predictors['cumulative_downloads_2015-02']+1)

In [None]:
# Disabled experiment, kept as a string literal so it does not run: Gini
# impurity of the review languages per app.
'''reviews_withLang = reviews.copy()
reviews_lang = reviews['text'].apply(detect_language)
for cat in list(set(reviews_lang)):
    reviews_withLang[cat] = reviews_lang.apply(set_lang_categories, args=(cat,))

reviews_withLang = reviews_withLang.groupby('id')[list(set(reviews_lang))].sum().reset_index()

def gini_impurity(inp):
    tot = sum(inp[1:])
    return sum([1.0*x / tot * (1 - 1.0*x / tot) for x in inp[1:]])

reviews_withLang["gini_reviews"] = reviews_withLang.apply(gini_impurity,axis=1)
predictors["gini_reviews"] = predictors.merge(reviews_withLang[["id","gini_reviews"]], how='left', on="id")["gini_reviews"]\
.replace("NaN",replacementValue)'''

#### Use the train_release_date.csv file

In [None]:
def generate_days_since_release(x):
    """Return the number of days from the date `x` ('YYYY-MM-DD') to 2015-03-01.

    The result is negative for dates after 2015-03-01.
    """
    reference_day = datetime.datetime.strptime('03/01/2015', '%m/%d/%Y').date()
    release_day = datetime.datetime.strptime(x, '%Y-%m-%d').date()
    elapsed = reference_day - release_day
    return elapsed.days

# Load the release dates and turn them into an age in days as of 2015-03-01.
release_date = pd.read_csv('train_release_date.csv').drop('Unnamed: 0', 1)
release_date['days_since_release'] = release_date['release_date'].apply(generate_days_since_release)
predictors = pd.merge(predictors, release_date.drop('release_date', 1), how='left',
                  on=["id","name"])

# replacing any missing values with 0; not a good idea but placeholder for now
# NOTE(review): replace('NaN', 0) matches the string 'NaN', not real missing
# values, so rows without a release date presumably keep NaN here - confirm.
predictors['days_since_release'] = predictors['days_since_release'].replace('NaN', 0)

# The number of downloads divided by time
# (+1 in the denominator avoids a division by zero for apps released on the
# reference day).
predictors['downloads_per_day_before'] = predictors['cumulative_downloads_2015-02'].divide(predictors['days_since_release']+1)

In [None]:

# Disabled alternative version of the polynomial-coefficient features, kept as
# a string literal so it does not run.
'''
#Generate the polynomial coefficients
def generator_coef(inp, coef):
    tmp = list(inp.values)
    #print tmp[60:61]
    #print int(tmp[59:60][0])
    Y = np.array(tmp[60:61]*(int(tmp[59:60][0])/7)+tmp[4:12])
    X = range(Y.shape[0])
    return  np.polyfit(X,Y,coef)[0]
#Redo by ignoring the -1 / 0 ? w = [1110011]     w = np.not_equal(inp[5:],np.ones(len(inp[5:]))*replacementValue

for c in range(4):
    predictors["coef_"+str(c)] = predictors.apply(generator_coef,axis=1,args=(c,))
    '''

#### Use the sentiment score

In [None]:
#Later we can compute the weighted average of sentiment scores based on reviewers.
#add positive and negative columns to indicate the app's popularity

#print reviews.ix[id==predictors.id[0],:]["sentiment_score"]
#print reviews.ix[id==predictors.id[1],:]["sentiment_score"]

# Defaults for apps without reviews: neutral score 0.5, neither flag set.
avg_score = [0.5]*predictors.shape[0]
predictors["positive"] = [0]*predictors.shape[0]
predictors["negative"] = [0]*predictors.shape[0]

# Weight of each reviewer: the week_8 values of every app they reviewed,
# summed over all predictor rows with that id and over all their reviews.
# NOTE(review): this rescans `predictors` once per review; a per-id lookup
# table would avoid the repeated boolean masks.
reviewers = {}
for i in range(reviews.shape[0]):
    user = reviews['reviewer'].values[i]
    item_id = reviews['id'].values[i]
    if user in reviewers:
        reviewers[user] = reviewers[user] + predictors["week_8"][predictors.id==item_id].values.sum()
    else:
        reviewers[user] = predictors["week_8"][predictors.id==item_id].values.sum()
        
# Reviewer-weighted mean sentiment per predictor row.
for i in range(predictors.shape[0]):
    score_list = list(reviews.ix[reviews["id"]==predictors.id[i],:]['sentiment_score'].values)
    reviewer_list =  list(reviews.ix[reviews["id"]==predictors.id[i],:]['reviewer'].values)
    s = 0
    rs = 0 
    # No reviews: keep the defaults set above.
    if len(score_list)==0: continue
    for j in range(len(score_list)):
        s = s + score_list[j]*reviewers[reviewer_list[j]]
        rs = rs + reviewers[reviewer_list[j]]
    # NOTE(review): rs is 0 when every reviewer of this app has weight 0; the
    # division then cannot produce a usable score - confirm whether the 0.5
    # default should be kept in that case.
    avg_score[i] = s/rs
    #avg_score[i] = reviews.ix[reviews["id"]==predictors.id[i],:]["sentiment_score"].mean()
    # Flags are written through `.values`, i.e. into the column's underlying array.
    if avg_score[i]>0.55: 
        predictors["positive"].values[i] = 1
    elif avg_score[i]<0.45: predictors["negative"].values[i] = 1
predictors["avg_sentiment_score"] = avg_score

 

#### Use coefficients of metrics

In [11]:
# Create zero-filled columns for each of the four usage metrics: max, mean and
# the three coefficients of a degree-2 polynomial fit.
for i in range(4):
    predictors["m"+str(i+1)+"_max"] = np.zeros(predictors.shape[0]) 
    #predictors["m"+str(i+1)+"_min"] = np.zeros(predictors.shape[0])
    predictors["m"+str(i+1)+"_mean"] = np.zeros(predictors.shape[0])
    #predictors["m"+str(i+1)+"_std"] = np.zeros(predictors.shape[0])
    for j in range(3):
        predictors["m"+str(i+1)+"_coef_"+str(j)] = np.zeros(predictors.shape[0])

# Fill the columns for every app that appears in `usages`; other rows keep 0.
for i in range(predictors.shape[0]):
    if predictors["id"].values[i] not in usages["id"].values: continue
    for j in range(4):
        tmp = usages.ix[usages["id"] == predictors["id"].values[i],:]
        # First row of this app for metric j+1, columns 6..13 (8 values).
        # NOTE(review): the [0] raises IndexError if the app has no row for
        # this metric - confirm every app in `usages` has all four metrics.
        time_series = np.array(tmp.ix[tmp["metric"] == j+1,6:14])[0]   
        # Series that still contain the -1 missing marker are skipped.
        if -1 in time_series: continue
        predictors["m"+str(j+1)+"_max"].values[i] = time_series.max()
        predictors["m"+str(j+1)+"_mean"].values[i] = time_series.mean()
        X = np.array(range(8))
        fit = np.polyfit(X,time_series,2)
        # polyfit returns the highest degree first; fit[2-k] stores the
        # coefficient of x**k in "_coef_k".
        for k in range(3):
            predictors["m"+str(j+1)+"_coef_"+str(k)].values[i] = fit[2-k]


#### Use coefficients of revenue

In [10]:
# Create zero-filled revenue columns. rev_max and rev_mean are re-assigned on
# every pass of the loop, which is redundant but harmless; it also fixes the
# resulting column order (rev_coef_0, rev_max, rev_mean, rev_coef_1, rev_coef_2).
for j in range(3):
    predictors["rev_coef_"+str(j)] = np.zeros(predictors.shape[0])
    predictors["rev_max"] = np.zeros(predictors.shape[0])
    #predictors["rev_min"] = np.zeros(predictors.shape[0])
    predictors["rev_mean"] = np.zeros(predictors.shape[0])
    #predictors["rev_std"] = np.zeros(predictors.shape[0])
# Fill the columns for every app/device pair present in `revenues`; other rows keep 0.
for i in range(predictors.shape[0]): 
    if predictors["id"].values[i] in revenues["id"].values: 
        curr_rev  = revenues.ix[revenues["id"]== predictors["id"].values[i],:]
        if predictors["device"].values[i] in curr_rev["device"].values:
            curr_rev = curr_rev.ix[curr_rev["device"] == predictors["device"].values[i],:]
            # First matching row, columns 5..60 (56 values).
            time_series = np.array(curr_rev.ix[:,5:61])[0]
            # Series that still contain the -1 missing marker are skipped.
            if -1 in time_series: continue 
            predictors["rev_max"].values[i] = time_series.max()
            predictors["rev_mean"].values[i] = time_series.mean()
            X = np.array(range(56))
            fit = np.polyfit(X,time_series,2)
            # polyfit returns the highest degree first; fit[2-k] stores the
            # coefficient of x**k in "rev_coef_k".
            for k in range(3):  predictors["rev_coef_"+str(k)].values[i] = fit[2-k]            

#### Smart Standardization across the categories


In [None]:
# Per-category mean and variance of every aggregated column, side by side in
# one table with '_mean' / '_var' suffixes.
by_category = predictors.groupby('category')
cat_mean = by_category.mean().reset_index()
cat_var = by_category.var().reset_index()
cat_std = pd.merge(cat_mean, cat_var, on="category", suffixes=('_mean', '_var'))

def smart_standardize(inp):
    """Return column `inp` centred on its category mean and scaled by (category variance + 1).

    The per-category statistics come from the module-level `cat_std` table and
    are matched to each row of `predictors` through the 'category' column.
    """
    mean_col = inp + "_mean"
    var_col = inp + "_var"
    row_stats = predictors[["id","name","category","device",inp]].merge(
        cat_std[["category", mean_col, var_col]], how="left", on="category")
    centred = predictors[inp] - row_stats[mean_col]
    return 1.0*centred/(row_stats[var_col]+1)


# Disabled: application of smart_standardize to the rating, revenue and usage
# columns, kept as a string literal so it does not run.
'''
for i in range(1,6):
    predictors["star"+str(i)] = smart_standardize("star"+str(i))
    
for j in range(3):
    predictors["rev_coef_"+str(j)]=  smart_standardize("rev_coef_"+str(j))
for i in range(4):
    predictors["m"+str(i+1)+"_max"]  = smart_standardize("m"+str(i+1)+"_max") 
    predictors["m"+str(i+1)+"_min"]  = smart_standardize("m"+str(i+1)+"_min")
    predictors["m"+str(i+1)+"_mean"]  = smart_standardize("m"+str(i+1)+"_mean")
    predictors["m"+str(i+1)+"_std"]   = smart_standardize("m"+str(i+1)+"_std")
for j in range(3):
        predictors["m"+str(i+1)+"_coef_"+str(j)]  = smart_standardize("m"+str(i+1)+"_coef_"+str(j))'''

## To csv



In [14]:
# Persist the feature table; the index is written as an unnamed first column,
# which is dropped again when the file is read back.
predictors.to_csv("predictors.csv")

In [15]:
# Reload the saved features and replace every remaining missing value with 0.
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)
predictors = predictors.fillna(0)

In [16]:
predictors.head(5)

Unnamed: 0,id,name,category,device,week_1,week_2,week_3,week_4,week_5,week_6,week_7,week_8,daily_avg,download_sum,coef_0,coef_1,coef_2,coef_3,maxStep,minStep,std,nb_missing,iphone,ipad,Productivity,Entertainment,Travel,Sports,Music,Shopping,Finance,Business,Navigation,Food and Drink,Utilities,Newsstand,Health and Fitness,News,Lifestyle,Medical,Weather,Games,Catalogs,Social Networking,Photo and Video,Reference,Books,Education,korean,other,japanese,chinese,english,cumulative_downloads_2015-02,dl_projection,avg_review,var_review,star1,star2,star3,star4,star5,num_ratings,ratings_per_daily_downloads,num_versions,num_review,num_review_per_daily_downloads,days_since_release,downloads_per_day_before,positive,negative,m1_max,m1_mean,m1_coef_0,m1_coef_1,m1_coef_2,m2_max,m2_mean,m2_coef_0,m2_coef_1,m2_coef_2,m3_max,m3_mean,m3_coef_0,m3_coef_1,m3_coef_2,m4_max,m4_mean,m4_coef_0,m4_coef_1,m4_coef_2,rev_coef_0,rev_max,rev_mean,rev_coef_1,rev_coef_2,ratio_latest_ver,avg_sentiment_score
0,281704574,"AIM: Chat, Free Text, Photo Share, Voice Message",Social Networking,iphone,6.239161,6.127804,6.101279,6.183265,6.078625,5.926926,5.792578,6.059791,433.678571,24286,433.678571,-2.585304,0.035204,0.003661,308,-233,91.952066,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,617121,787123,3.2,2.684058,0.189152,0.08059,0.145992,0.139242,0.445023,469040,0.760044,3,70,0.000113,2424,254.482887,0,1,0.035065,0.032192,0.0324,-0.001056,0.000199,0.216173,0.212028,0.214854,-0.000385,-8.4e-05,16.729371,15.190247,15.083152,-0.472173,0.100554,96931,88379.625,88130.375,-2675.89881,549.422619,0.0,0,0.0,0.0,0.0,44.285714,0.295056
1,281922769,Mobile MIM,Medical,ipad,4.075113,4.084775,3.964886,4.198275,4.089571,4.027899,4.124828,3.991626,58.678571,3286,58.678571,-0.028161,-0.00504,-0.000418,28,-34,8.860239,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,83281,106283,1.0,0.0,0.256757,0.131532,0.172072,0.142342,0.296396,1109,0.013316,1,1,1.2e-05,2424,34.34268,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,100.0,0.253146
2,281922769,Mobile MIM,Medical,iphone,4.576183,4.532599,4.48703,4.507715,4.480578,4.464265,4.439284,4.610868,91.267857,5111,91.267857,-0.048223,0.014281,0.000401,51,-48,12.69349,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,99000,134777,1.0,0.0,0.256757,0.131532,0.172072,0.142342,0.296396,1109,0.011202,1,1,1e-05,2424,40.824742,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,100.0,0.253146
3,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,ipad,5.61781,5.557379,5.520317,5.371302,5.493649,5.418637,5.434969,5.393628,239.660714,13421,239.660714,-0.934279,0.019362,-0.000439,120,-73,31.992341,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,291496,385443,2.666667,2.941176,0.281447,0.10309,0.140426,0.140326,0.33469,50353,0.172739,3,18,6.2e-05,2424,120.204536,0,1,0.133475,0.126261,0.12898,-0.000505,-5.4e-05,0.642443,0.637267,0.6306,0.004964,-0.000612,20.860287,19.811508,20.453041,-0.239506,0.011242,363825,346531.75,350810.583333,-516.928571,-141.119048,415.328688,1708,359.142857,-0.732351,-0.035426,61.111111,0.23811
4,281935788,Epocrates References & Tools for Healthcare Pr...,Medical,iphone,7.038282,7.073875,7.173192,7.016097,7.020318,7.022613,7.028075,7.001246,1150.660714,64437,1150.660714,-1.639337,-0.069742,0.006644,394,-276,154.055797,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,969059,1420118,2.666667,2.941176,0.281447,0.10309,0.140426,0.140326,0.33469,50353,0.051961,3,18,1.9e-05,2424,399.611959,0,1,0.133475,0.126261,0.12898,-0.000505,-5.4e-05,0.642443,0.637267,0.6306,0.004964,-0.000612,20.860287,19.811508,20.453041,-0.239506,0.011242,363825,346531.75,350810.583333,-516.928571,-141.119048,4691.119653,10011,5959.821429,195.635228,-4.040557,61.111111,0.23811


## How well did the predictors perform   --> Start running from here

In [1]:
#imported libraries
import pandas as pd
import numpy as np
import scipy as sp
import math
import matplotlib.pyplot as plt
import datetime
%matplotlib inline  
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Load the raw input tables.
downloads = pd.read_csv('train_app_downloads.csv')
reviews = pd.read_csv('train_app_review.csv')
ratings = pd.read_csv('train_app_rating.csv')
usages = pd.read_csv('train_usage.csv')
revenues = pd.read_csv('train_revenue.csv')
output = pd.read_csv('train_final_downloads.csv')
# Labels of the 56 daily columns, starting on 2015-03-01.
dateRange = pd.date_range('2015-03-01', periods=56).format(formatter=lambda x: x.strftime('%Y-%m-%d'))

# sentiment.csv has no header row; header=None is the documented way to say so
# (negative header values are rejected by newer pandas).
sentiment = pd.read_csv('sentiment.csv', header=None).iloc[:, 0]
# A score of exactly 0.0 is mapped to the neutral value 0.5. `where` returns a
# new Series instead of writing through `.values`, which may be a copy.
sentiment = sentiment.where(sentiment != 0.0, 0.5)
reviews["sentiment_score"] = sentiment.values

#We map -1 to 0 in the downloads (there are no 0 in the initial data)
replacementValue = 0
downloads = downloads.replace(-1, replacementValue)

# Minor corrections: fix the misspelled 'start1' column and attach the ratings
# to every app/device row of `downloads`.
ratings = ratings.rename(columns={'start1': 'star1'})
# fillna (not replace("NaN", ...)) is what actually fills the rows that have
# no rating: replace() would only match the literal string "NaN".
ratings = pd.merge(downloads.drop(dateRange, axis=1), ratings.drop('Unnamed: 0', axis=1), how='left',
                   on=["id","name","category"]).fillna(replacementValue)

In [3]:
#This is the metric we use to determine our performance
def metric(y_pred,y_test,percent=1):
    """Return the percentage of the true top items that the prediction also ranks on top.

    y_pred  -- predicted scores.
    y_test  -- true scores, in the same order as y_pred.
    percent -- size of the "top" group as a percentage of the sample.

    The top group holds int(len(y_pred) * percent / 100) items. The score is
    the overlap between the predicted and the true top group divided by that
    group size, so a perfect ranking always scores 100. When the group is
    empty (too few samples for the requested percentage) the score is 0.0.
    """
    top = int(len(y_pred)/100.0*percent)
    if top <= 0:
        return 0.0

    def top_indices(values):
        # Positions of the `top` largest values; ties keep their input order.
        ranked = sorted(enumerate(values), key=lambda pair: pair[1], reverse=True)
        return set(position for position, _ in ranked[:top])

    hits = len(top_indices(y_pred) & top_indices(y_test))
    return 100.0*hits/top

In [4]:
# Load the saved features and replace every remaining missing value with 0.
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)
predictors = predictors.fillna(0)

## Predictor selection: two alternative methods (run only one of them)

### Forward Predictor selection

In [None]:
#Use a model to do a forward recursive predictor selection
# NOTE(review): sklearn's RFE removes features one at a time (backward
# elimination), so "forward" above is presumably a misnomer - confirm.
#mod=linear_model.Lasso(alpha=100,fit_intercept=False)
#mod=linear_model.LinearRegression(fit_intercept=False)
#mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100) 
#mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100) 
mod = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\

# Rank every feature (columns from position 4 on) against column 5 of `output`;
# n_features_to_select=1 with step=1 yields a complete ranking.
rfe = RFE(estimator=mod, n_features_to_select=1, step=1)
rfe.fit(predictors.as_matrix()[:,4:], output.as_matrix()[:,5])
ranking = rfe.ranking_

In [None]:
# Print (feature name, rank) pairs ordered by rank.
for e in [(i[0],i[1]) for i in sorted(zip(predictors.columns[4:],ranking),key=lambda x: x[1])]:
    print e

In [None]:
#manually drop predictors 
#predictors_to_drop = []
# Keep the nb_pred_to_keep best-ranked features and drop all the others.
nb_pred_to_keep = 60
predictors_to_drop = [i[0] for i in sorted(zip(predictors.columns[4:],ranking),key=lambda x: x[1])][nb_pred_to_keep:]
for col in predictors_to_drop:
    predictors = predictors.drop(col,1)
    

### Other predictor selection method using Lasso

In [None]:
# Other method
# Reload the full feature table, fit a Lasso on the log of the target and keep
# only the features whose coefficient is non-zero.
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)
predictors = predictors.fillna(0)
#var_select = linear_model.Lasso(alpha = 0.01).fit(predictor_train_top_10_precent.as_matrix()[:,4:],np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
var_select = linear_model.Lasso(alpha = 0.1).fit(predictors.as_matrix()[:,4:],np.log(output["cumulative_downloads_2016-02"].as_matrix()))

# Indices of the non-zero coefficients; +4 maps them back to column positions.
importance_list = [i for i in range(len(var_select.coef_)) if abs(var_select.coef_[i])>0]
important_predictors = [predictors.columns.values[i+4] for i in range(len(var_select.coef_)) if (i in importance_list)]
print important_predictors
for col in predictors.columns.values[4:]:
    if col not in important_predictors:
        predictors = predictors.drop(col,1)

In [None]:
predictors.columns.values

# Classification + Regression

In [None]:
# Two-stage evaluation with K-fold cross validation:
#   1. a first regressor ranks the test fold and keeps its top
#      `top_percent_classif` percent (the "classification" step);
#   2. a second regressor, trained on the top `top_percent_classif` percent of
#      the training fold, re-ranks those candidates to pick the final top 1%.
print list(predictors.columns.values) 
np.random.seed(5)
K = 5

top_percent_classif = 20


kf = KFold(len(predictors), n_folds=K)
# Per-fold scores. new_top_noClassif is initialised twice (redundant).
old_top = []
new_top = []
new_top_noClassif = []
top_10 = []
new_top_select = [] 
new_top_noClassif = [] 
for train, test in kf:
    #base model
    '''old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
    old_y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])
    old_top.append(metric(old_y_pred,output.as_matrix()[test,5]))'''
    
    #model to determine the top 10%   (CLASSIFICATION)
    # (the share actually kept is top_percent_classif, not necessarily 10%)
    #mod_class10=GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
    mod_class10= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100)\
    .fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
    
    y_pred =  mod_class10.predict(predictors.as_matrix()[test,4:])
    
    #estimate of the top 10% of the test set
    estimate_class10 = predictors.iloc[test].copy()
    estimate_class10["firstEstimate"] = y_pred
    estimate_class10 = estimate_class10.sort_values(by= "firstEstimate",ascending = False).iloc[0:int(1.0*top_percent_classif/100.0*len(estimate_class10))]
    estimate_class10 = estimate_class10.drop("firstEstimate",1)
#     estimate_class10 = estimate_class10.sort_values(by= "daily_avg",ascending = False).iloc[0:int(1.0*top_percent_classif/100.0*len(estimate_class10))]

    
    #top 10% of the training set
    output_train_top_10_precent = output.iloc[train].copy().sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[0:int(1.0*top_percent_classif/100.0*len(output.iloc[train]))].drop('Unnamed: 0',1)
    predictor_train_top_10_precent = output_train_top_10_precent.merge(predictors, how='left', on=["id","name","category","device"]).copy()
    predictor_train_top_10_precent = predictor_train_top_10_precent.drop('cumulative_downloads_2016-02',1)
    #predictor_train_top_10_precent = predictor_train_top_10_precent.drop('firstEstimate',1)
    
    #This is the actual top 1% of the test set
    output_test_top_1_precent = output.iloc[test].sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[0:int(0.01*len(output.iloc[test]))].copy()

    
    #second model -> Regression on the top obtained by the first model
    #mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100)\
    #mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100)\
    #mod_top1=linear_model.Lasso(alpha=100,fit_intercept=False)\
    #mod_top1=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls')\
    mod_top1 = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
    .fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))

    
    # Predictions are on the log scale of the target; only their order is used.
    y_pred_2 =  mod_top1.predict(estimate_class10.as_matrix()[:,4:])
    
    
    '''#Andrew's regression with additionnal lasso predictor selection
    #(1) feature selection : lasso or random forest
    #var_select = linear_model.Lasso(alpha = 0.01).fit(predictor_train_top_10_precent.as_matrix()[:,4:],np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
    var_select = RandomForestRegressor(max_features = "sqrt",n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
      
    #importance_list = [i for i in  range(len(var_select.coef_)) if abs(var_select.coef_[i])>0]    
    importance_list = list(reversed(np.argsort(var_select.feature_importances_)))[0:20]
    
    #new train and test set with the selected variables
    X_rf_train = predictor_train_top_10_precent.as_matrix()[:,4:][:,importance_list]
    y_rf_train = output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()
    X_rf_test = estimate_class10.as_matrix()[:,4:][:,importance_list]  
    
    #(2) regression: random forest or boosting 
    #mod_top1_select= RandomForestRegressor(max_features = "sqrt",n_estimators = 100).fit(X_rf_train, y_rf_train)
    params = {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01, 'loss': 'ls'}
    mod_top1_select= GradientBoostingRegressor(**params).fit(X_rf_train, y_rf_train)
    #mod_top1_select = SVR(kernel = 'poly',degree = 3).fit(X_rf_train,y_rf_train)
    y_pred_3 = mod_top1_select.predict(X_rf_test)
    
    #No Classification model
    mod_noClassif= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
    y_pred_noClassif =  mod_noClassif.predict(predictors.as_matrix()[test,4:])'''

    
    
    
    
    # Final top 1%: the best second-stage estimates among the candidates.
    estimate_top1 = estimate_class10.copy()
    #estimate_top1_select = estimate_class10.copy()
    #estimate_top1_noClassif = predictors.iloc[test].copy()
    estimate_top1["secondEstimate"] = y_pred_2
    #estimate_top1_select["thirdEstimate"] = y_pred_3
    #estimate_top1_noClassif["noClassifEstimate"] = y_pred_noClassif
    estimate_top1 = estimate_top1.sort_values(by= "secondEstimate",ascending = False).iloc[:int(0.01*len(output.iloc[test]))]
    
    #estimate_top1_select = estimate_top1_select.sort_values(by= "thirdEstimate",ascending = False).iloc[0:int(0.01*len(output.iloc[test]))]
    #estimate_top1_noClassif = estimate_top1_noClassif.sort_values(by= "noClassifEstimate",ascending = False).iloc[0:int(0.01*len(output.iloc[test]))]

    # Despite its name this is a hit rate: the percentage of the true top 1%
    # that also appears in the estimated top 1% (higher is better).
    estimation_error = len(estimate_top1.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent)
    new_top.append(estimation_error)
    #new_top_select.append(len(estimate_top1_select.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))
    #new_top_noClassif.append(len(estimate_top1_noClassif.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))
    # Share of the true top 1% that survived the first (classification) stage.
    top_10.append(len(estimate_class10.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))
       
# Averages over the K folds.
#print "Old model           : " + str(1.0*sum(old_top)/len(old_top))
print "Top10%              : " + str(1.0*sum(top_10)/len(top_10))
print "Top1% with classif1 : " + str(1.0*sum(new_top)/len(new_top))
#print "Top1% with classif2 : " + str(1.0*sum(new_top_select)/len(new_top_select))
#print "Top1% no classif1   : " + str(1.0*sum(new_top_noClassif)/len(new_top_noClassif))

In [None]:
# NOTE(review): the only live assignment of var_select in this notebook is a
# Lasso model, which exposes coef_ rather than feature_importances_; the
# random-forest assignment lives in a disabled string block. Confirm which
# model this cell is meant to inspect.
var_select.feature_importances_

In [None]:
len(predictor_train_top_10_precent)

## Cross Validation k-folds

In [None]:
# Plain K-fold cross validation of a single model, scored with metric()
# (default top 1%) and averaged over the folds.
K = 5
print list(predictors.columns.values)
np.random.seed(1)
kf = KFold(len(predictors), n_folds=K)
new_top = []
for train, test in kf:

    #model
    #mod=linear_model.LinearRegression(fit_intercept=False)\ 
    #mod=linear_model.Lasso(alpha=100,fit_intercept=False)\
    #mod = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls')\ 
    #mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100)\
    #mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100)\
    mod = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
    .fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5]) 

    y_pred =  mod.predict(predictors.as_matrix()[test,4:])
    new_top.append(metric(y_pred,output.as_matrix()[test,5]))

print 1.0*sum(new_top)/len(new_top)

## Attempt at boosting

In [None]:
# Grid search over the gradient-boosting hyper-parameters (number of trees,
# learning rate, tree depth); each combination is scored by K-fold CV with metric().
print list(predictors.columns.values)
K = 5

for n_est in [1000,5000,10000,50000]:
    for l_rate in [0.001,0.01,0.1]:
        for m_depth in range(1,5):

            np.random.seed(1)
            kf = KFold(len(predictors), n_folds=K)
            # old_top is never filled in this cell.
            old_top = []
            new_top = []
            for train, test in kf:
                #model
                mod = GradientBoostingRegressor(n_estimators=n_est, learning_rate=l_rate,max_depth=m_depth, random_state=0, loss='ls').fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5]) 
                y_pred =  mod.predict(predictors.as_matrix()[test,4:])
                new_top.append(metric(y_pred,output.as_matrix()[test,5]))

            print "Performance with \tB="+str(n_est)+"\tl="+str(l_rate)+"\td="+ str(m_depth)+" is: \t"+str(1.0*sum(new_top)/len(new_top))



## Attempt at SVR

In [None]:
from sklearn.svm import SVR

# Grid search over SVR hyper-parameters, scoring each combination by the mean
# of `metric` over K folds. Every fit is capped at max_iter=1000.
print(list(predictors.columns.values))
K = 5

# The feature matrix and target do not change across the grid, so convert the
# frames once instead of three times per fold of every combination.
features = predictors.as_matrix()[:, 4:]
target = output.as_matrix()[:, 5]

# NOTE(review): `degree` and `coef0` are swept for every kernel; per the
# scikit-learn SVR documentation `degree` only affects 'poly' and `coef0` only
# 'poly'/'sigmoid', so many combinations should repeat the same fit -- confirm
# before trimming the grid.
for ker in ['linear', 'poly', 'rbf', 'sigmoid']:
    for deg in range(5):
        for coef in [0,1,2]:
            for Cost in [0.001,0.01,0.1,1,10]:
                for epsi in [0.001,0.01,0.1,1,10]:

                    # Same seed and same fold layout for every combination.
                    np.random.seed(1)
                    kf = KFold(len(predictors), n_folds=K)
                    old_top = []
                    new_top = []
                    for train, test in kf:
                        #model
                        #mod = SVR(kernel=ker, degree=deg,  coef0=coef, C=Cost, epsilon=epsi)  (uncapped variant)
                        mod = SVR(kernel=ker, degree=deg,  coef0=coef, C=Cost, epsilon=epsi,max_iter=1000)
                        mod.fit(features[train], target[train])
                        y_pred = mod.predict(features[test])
                        new_top.append(metric(y_pred, target[test]))

                    print("Performance with \tKernel="+ker+"\tdegree="+str(deg)+"\tcoef0="+ str(coef)+"\tCost="+str(Cost)+"\tepsilon="+ str(epsi)+" is: \t"+str(1.0*sum(new_top)/len(new_top)))



# CV on the size of the Classification top

In [None]:
print list(predictors.columns.values) 

K = 5

cv_top_10 = []
cv_top_1  = []

for top in range(1,31):
np.random.seed(1)
    top_percent_classif = top

    kf = KFold(len(predictors), n_folds=K)
    old_top = []
    new_top = []
    top_10 = []
    new_top_select = [] 
    for train, test in kf:
        #base model
        old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
        old_y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])
        old_top.append(metric(old_y_pred,output.as_matrix()[test,5]))

        #model to determine the top 10%   (CLASSIFICATION)
        mod_class10= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])

        y_pred =  mod_class10.predict(predictors.as_matrix()[test,4:])

        #estimate of the top 10% of the test set
        estimate_class10 = predictors.iloc[test].copy()
        estimate_class10["firstEstimate"] = y_pred
        #estimate_class10 = estimate_class10.sort_values(by= "firstEstimate",ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(estimate_class10))]
        estimate_class10 = estimate_class10.drop("firstEstimate",1)
        estimate_class10 = estimate_class10.sort_values(by= "daily_avg",ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(estimate_class10))]


        #top 10% of the trainning set
        output_train_top_10_precent = output.iloc[train].copy().sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[1:int(1.0*top_percent_classif/100.0*len(output.iloc[train]))].drop('Unnamed: 0',1)
        predictor_train_top_10_precent = output_train_top_10_precent.merge(predictors, how='left', on=["id","name","category","device"]).copy()
        predictor_train_top_10_precent = predictor_train_top_10_precent.drop('cumulative_downloads_2016-02',1)
        #predictor_train_top_10_precent = predictor_train_top_10_precent.drop('firstEstimate',1)

        #This is the actual top 1% of the test set
        output_test_top_1_precent = output.iloc[test].sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[1:int(0.01*len(output.iloc[test]))].copy()


        #second model -> Regression on the top obtainned by regression
        #mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix())
        mod_top1= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
        #mod_top1=linear_model.Lasso(alpha=100,fit_intercept=False).fit(predictor_train_top_10_precent.as_matrix()[:,4:], np.log(output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))
        #mod_top1=GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls').fit(predictor_train_top_10_precent.as_matrix()[:,4:], (output_train_top_10_precent["cumulative_downloads_2016-02"].as_matrix()))

        y_pred_2 =  mod_top1.predict(estimate_class10.as_matrix()[:,4:])

        estimate_top1 = estimate_class10.copy()
        estimate_top1["secondEstimate"] = y_pred_2

        estimate_top1 = estimate_top1.sort_values(by= "secondEstimate",ascending = False).iloc[1:int(0.01*len(output.iloc[test]))]

        estimation_error = len(estimate_top1.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent)
        new_top.append(estimation_error)
        top_10.append(len(estimate_class10.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))


    cv_top_10.append(1.0*sum(top_10)/len(top_10))
    cv_top_1.append(1.0*sum(new_top)/len(new_top))

plt.plot(range(1,31),cv_top_10,label = 'top 10 estimate')
plt.plot(range(1,31),cv_top_1,label = 'top 1 estimate')
plt.legend(loc = 'best')




# CV on the size of the predictor set

In [None]:

# Cross-validate how many predictors to keep (5, 10, ..., 95), using the
# externally computed `ranking` to decide which columns are dropped.
# WARNING: this cell overwrites the global `predictors` on every pass.
np.random.seed(1)
K = 5


for nb_pred_to_keep in range(5,100,5):

    # Reload the full predictor table each time, since columns are dropped below.
    predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1)
    predictors = predictors.fillna(0)

    # Pair every feature column (from column 4 on) with its `ranking` entry,
    # sort ascending by that entry and drop everything after the first nb_pred_to_keep.
    # presumably a lower `ranking` value means a more useful feature -- confirm where `ranking` is built
    predictors_to_drop = [i[0] for i in sorted(zip(predictors.columns[4:],ranking),key=lambda x: x[1])][nb_pred_to_keep:]
    for col in predictors_to_drop:
        predictors = predictors.drop(col,1)

    kf = KFold(len(predictors), n_folds=K)
    #print list(predictors.columns.values) 
    new_top_noClassif = [] 
    for train, test in kf:

        # Actual top 1% of the test fold (note: .iloc[1:k] skips the single highest row).
        output_test_top_1_precent = output.iloc[test].sort_values(by= 'cumulative_downloads_2016-02',ascending = False).iloc[1:int(0.01*len(output.iloc[test]))].copy()


        #No Classification model: one random forest on the kept predictors
        mod_noClassif= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5])
        y_pred_noClassif =  mod_noClassif.predict(predictors.as_matrix()[test,4:])

        # Predicted top 1% of the test fold, ranked by the forest's estimate.
        estimate_top1_noClassif = predictors.iloc[test].copy()
        estimate_top1_noClassif["noClassifEstimate"] = y_pred_noClassif
        estimate_top1_noClassif = estimate_top1_noClassif.sort_values(by= "noClassifEstimate",ascending = False).iloc[1:int(0.01*len(output.iloc[test]))]

        # Percentage of the actual top 1% that also appears in the predicted top 1%.
        new_top_noClassif.append(len(estimate_top1_noClassif.merge(output_test_top_1_precent, how='inner', on=["id","name","category","device"]))*100.0/len(output_test_top_1_precent))
        

    print "Top1% no classif with "+str(nb_pred_to_keep)+" kept predictors : " + str(1.0*sum(new_top_noClassif)/len(new_top_noClassif))

RForest pred selection and CV on the number of predictors<br/>['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'daily_avg', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'maxStep', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Productivity', 'Entertainment', 'Travel', 'Sports', 'Music', 'Shopping', 'Finance', 'Business', 'Navigation', 'Food and Drink', 'Utilities', 'Newsstand', 'Health and Fitness', 'News', 'Lifestyle', 'Medical', 'Weather', 'Games', 'Catalogs', 'Social Networking', 'Photo and Video', 'Reference', 'Books', 'Education', 'chinese', 'Unnamed: 49', 'japanese', 'other', 'english', 'korean', 'cumulative_downloads_2015-02', 'avg_review', 'var_review', 'star1', 'star2', 'star3', 'star4', 'star5', 'num_ratings', 'reviewPerDailyDownloads', 'num_versions', 'num_review', 'days_since_release', 'downloads_per_day_before', 'positive', 'negative', 'm1_min', 'm1_mean', 'm1_std', 'm1_coef_0', 'm1_coef_1', 'm1_coef_2', 'm2_max', 'm2_min', 'm2_coef_0', 'm2_coef_1', 'm2_coef_2', 'm3_min', 'm3_mean', 'm3_std', 'm3_coef_0', 'm3_coef_1', 'm3_coef_2', 'm4_max', 'm4_min', 'm4_mean', 'm4_std', 'm4_coef_0', 'm4_coef_1', 'm4_coef_2', 'rev_coef_0', 'rev_max', 'rev_min', 'rev_std', 'rev_coef_1', 'rev_coef_2'] <br/>
Top1% no classif with 5 kept predictors : 62.5396825397    <br/>
Top1% no classif with 10 kept predictors : 65.0793650794<br/>
Top1% no classif with 15 kept predictors : 65.3968253968<br/>
Top1% no classif with 20 kept predictors : 64.7619047619<br/>
Top1% no classif with 25 kept predictors : 64.7619047619<br/>
Top1% no classif with 30 kept predictors : 65.3968253968<br/>
Top1% no classif with 35 kept predictors : 66.0317460317<br/>
Top1% no classif with 40 kept predictors : 65.7142857143<br/>
Top1% no classif with 45 kept predictors : 65.0793650794<br/>
Top1% no classif with 50 kept predictors : 65.3968253968<br/>
Top1% no classif with 55 kept predictors : 65.0793650794<br/>
Top1% no classif with 60 kept predictors : 65.3968253968<br/>
Top1% no classif with 65 kept predictors : 64.4444444444<br/>
Top1% no classif with 70 kept predictors : 65.3968253968<br/>
Top1% no classif with 75 kept predictors : 63.8095238095<br/>
Top1% no classif with 80 kept predictors : 63.4920634921<br/>
Top1% no classif with 85 kept predictors : 64.4444444444<br/>
Top1% no classif with 90 kept predictors : 61.9047619048<br/>



.
.
.
.

# Old Testing model

In [None]:
# Older evaluation scheme: compare a baseline linear model (feature columns 4..11)
# with the current model (all feature columns) on repeated random train/test splits.
old_top = []
new_top = []

test_frac = 0.31  #Fraction of test points
N = 20   #loop bound: the loops below use range(1,N), i.e. N-1 = 19 splits are evaluated
np.random.seed(1)
for i in range(1,N):
    # Draw a split seed from the global RNG.
    r = np.random.randint(1,429496729)
    X_train, X_test, y_train, y_test = train_test_split(predictors.as_matrix()[:,4:12], output.as_matrix()[:,5], test_size=test_frac, random_state=r)
    # Score on at most the first 10000 test rows.
    X_test = X_test[0:10000]
    y_test = y_test[0:10000]
    old_mod=linear_model.LinearRegression(fit_intercept=False).fit(X_train,y_train)
    old_y_pred =  old_mod.predict(X_test)
    old_top.append(metric(old_y_pred,y_test))
    
# Re-seed before evaluating the current model.
# NOTE(review): the forest below has no random_state and presumably draws from the same
# global RNG, so after the first pass the split seeds may differ from the baseline loop -- confirm.
np.random.seed(1)
for i in range(1,N):
    r = np.random.randint(1,429496729)
    X_train, X_test, y_train, y_test = train_test_split(predictors.as_matrix()[:,4:], output.as_matrix()[:,5], test_size=test_frac, random_state=r)
    X_test = X_test[0:10000]
    y_test = y_test[0:10000]
    #mod=linear_model.LinearRegression(fit_intercept=False).fit(X_train,y_train)
    #mod=linear_model.Lasso(alpha=100,fit_intercept=False).fit(X_train,y_train)
    #mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100).fit(X_train,y_train)
    mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(X_train,y_train)
    y_pred =  mod.predict(X_test)
    new_top.append(metric(y_pred,y_test))
# Only plot the per-split scores when there are few enough of them.
if (N<300):
    fig = plt.figure()
    plt.plot(range(1,N),old_top,label="old")
    plt.plot(range(1,N),new_top,label="performance")
    plt.legend(loc='best')
    plt.show()

# Mean baseline score, feature list, mean score of the current model.
print 1.0*sum(old_top)/len(old_top)
print list(predictors.columns.values)
print 1.0*sum(new_top)/len(new_top)

54.8947368421  
55.0526315789  #with #missing and Lasso <br/>
54.7894736842  #including raw ratings Lasso :( <br/>
55.0526315789  #with #missing weightedSumRatings and Lasso <br/>
55.1052631579  #adding the categories <br/>
55.3684210526  #adding average sentiment score and positive/negative label <br/>
55.5263157895  #adding coefficients of usages


With 10000 only:
54.8947368421 all

56.5263157895 on Lasso all but 'rev_coef_i'

Lasso All alpha = 100
['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'daily_avg', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'maxStep', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Productivity', 'Entertainment', 'Travel', 'Sports', 'Music', 'Shopping', 'Finance', 'Business', 'Navigation', 'Food and Drink', 'Utilities', 'Newsstand', 'Health and Fitness', 'News', 'Lifestyle', 'Medical', 'Weather', 'Games', 'Catalogs', 'Social Networking', 'Photo and Video', 'Reference', 'Books', 'Education', 'avg_review', 'var_review', 'star1', 'star2', 'star3', 'star4', 'star5', 'positive', 'negative', 'm1_coef_0', 'm1_coef_1', 'm1_coef_2', 'm2_coef_0', 'm2_coef_1', 'm2_coef_2', 'm3_coef_0', 'm3_coef_1', 'm3_coef_2', 'm4_coef_0', 'm4_coef_1', 'm4_coef_2']
57.1052631579

Predictor selection, pick the top 50 features selected in the forward recursive selection with Lasso and then do random forest

54.6842105263
['id', 'name', 'category', 'device', 'week_1', 'week_2', 'week_3', 'week_4', 'week_5', 'week_6', 'week_7', 'week_8', 'daily_avg', 'coef_0', 'coef_1', 'coef_2', 'coef_3', 'maxStep', 'minStep', 'std', 'nb_missing', 'iphone', 'ipad', 'Productivity', 'Entertainment', 'Sports', 'Music', 'Shopping', 'Finance', 'Business', 'Navigation', 'Food and Drink', 'Utilities', 'News', 'Lifestyle', 'Medical', 'Weather', 'Games', 'Catalogs', 'Social Networking', 'Photo and Video', 'Reference', 'Education', 'avg_review', 'var_review', 'star2', 'star3', 'star4', 'positive', 'negative', 'm2_coef_0', 'm3_coef_0', 'm3_coef_1', 'm4_coef_2']
57.3157894737

In [None]:
#Estimation of alpha for the Lasso regression with a cross-validation-like approach:
#each alpha in 90, 95, 100, 105 is scored on repeated random train/test splits.
for a in np.arange(90,110,5):

    old_top = []
    new_top = []

    test_frac = 0.31  #Fraction of test points
    N = 20   #loop bound: range(1,N) below gives N-1 = 19 splits per alpha


    # Same seed for every alpha, so the split seeds are drawn from the same starting state.
    np.random.seed(1)
    for i in range(1,N):
        r = np.random.randint(1,429496729)
        X_train, X_test, y_train, y_test = train_test_split(predictors.as_matrix()[:,4:], output.as_matrix()[:,5], test_size=test_frac, random_state=r)
        # Score on at most the first 10000 test rows.
        X_test = X_test[0:10000]
        y_test = y_test[0:10000]
        mod=linear_model.Lasso(alpha=a,fit_intercept=False).fit(X_train,y_train) 
        y_pred =  mod.predict(X_test)
        new_top.append(metric(y_pred,y_test))


    # alpha, then its mean score over the splits, then a separator line
    print a
    print 1.0*sum(new_top)/len(new_top)
    print " "

# Making Classification on top 10%

In [None]:
# Reload the predictor table and keep only the rows whose week_8 value is at
# least the value found 10% of the way down the descending week_8 ranking.
predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1).fillna(0)

cutoff_position = int(0.1*len(predictors))
thr = predictors["week_8"].sort_values(ascending = False).iloc[cutoff_position]
predictors  = predictors[predictors["week_8"] >= thr]

# Number of rows kept.
len(predictors)

In [None]:
# Independent copies: plain assignment would make all three names (and `output`
# itself) refer to one DataFrame, so each "is_top" column written later would
# overwrite the previous one and also appear in `output`.
output_classification_10 = output.copy()
output_classification_50 = output.copy()
output_classification_100 = output.copy()
#Is top ?
def generator_istop(inp,threshold):
    """Label one row as top or not.

    Returns the string 'True' when the value at position 5 of `inp` is greater
    than or equal to `threshold`, and the string 'False' otherwise.
    """
    return 'True' if inp[5] >= threshold else 'False'

# Label the rows whose downloads reach the value found at rank len(output)/10
# of the descending ranking, i.e. roughly the top 10%.
# (`len(output)/10` is used as an integer position -- this relies on Python 2 integer division.)
threshold = output.sort_values('cumulative_downloads_2016-02',ascending = False).iloc[len(output)/10]['cumulative_downloads_2016-02']
output_classification_10["is_top"] = output.apply(generator_istop,axis = 1,args=(threshold,))

# Same with rank len(output)/50, i.e. roughly the top 2%.
threshold = output.sort_values('cumulative_downloads_2016-02',ascending = False).iloc[len(output)/50]['cumulative_downloads_2016-02']
output_classification_50["is_top"] = output.apply(generator_istop,axis = 1,args=(threshold,))

# Same with rank len(output)/100, i.e. roughly the top 1%.
threshold = output.sort_values('cumulative_downloads_2016-02',ascending = False).iloc[len(output)/100]['cumulative_downloads_2016-02']
output_classification_100["is_top"] = output.apply(generator_istop,axis = 1,args=(threshold,))

In [None]:
output_classification_10.head()

In [None]:
from sklearn.svm import SVC
# 10-fold CV of a classifier trained on column 6 of output_classification_10
# (presumably the "is_top" label -- confirm the column order), scored with metric_classification.
K=10
# NOTE(review): the row count 32339 is hard-coded; other cells use len(predictors).
# It must match the number of rows of both `predictors` and output_classification_10 -- confirm.
kf = KFold(32339, n_folds=K)
old_top = []
new_top = []
for train, test in kf:
    #model (the solver is capped at max_iter=10)
    mod = SVC(C=10.0, kernel='poly', degree=3, gamma='auto', coef0=2.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=10, decision_function_shape=None, random_state=None).fit(predictors.as_matrix()[train,4:],output_classification_10.as_matrix()[train,6])
    #mod = linear_model.LogisticRegression().fit(predictors.as_matrix()[train,4:], output_classification_10.as_matrix()[train,6])
    y_pred =  mod.predict(predictors.as_matrix()[test,4:])
    new_top.append(metric_classification(y_pred,output_classification_10.as_matrix()[test,6]))
# Feature list, then the mean score over the folds.
print list(predictors.columns.values)
print 1.0*sum(new_top)/len(new_top)


In [None]:
###This metric is not very good, put True everywhere and get 100%
def metric_classification(y_pred,y_test):
    """Return, as a percentage of len(y_pred), how many positions hold the
    string 'True' in `y_pred` while `y_test` holds the string "False".

    Positions are taken from range(len(y_test)); labels are compared as strings.
    """
    nb_top = sum(
        1
        for position in range(len(y_test))
        if y_pred[position]=='True' and y_test[position]=="False"
    )
    return 100.0*nb_top/len(y_pred)


In [None]:
list(y_pred).count("False")*1.0/len(y_pred)

In [None]:
print list(y_pred).count('True')
print list(output_classification_10.as_matrix()[test,6]).count('True')
print len(y_pred)

Question: what if, instead of using 'cumulative_downloads_2016-02' directly, we used exp('cumulative_downloads_2016-02') to try to shrink the lowest points?


In [None]:
# Work on a copy: with plain assignment `output_scaled` and `output` are the
# same DataFrame, so the "scaled_downloads" column added below would silently
# appear in `output` as well.
output_scaled = output.copy()

def generator_outputScaler(inp):
    """Return the transformed target for one row.

    The value at position 5 of `inp` is the quantity being transformed. The
    active transform is the identity; the commented lines are alternatives
    that were tried, each followed by the number recorded for it
    (presumably the resulting CV score -- confirm).
    """
    #return math.exp(inp[5]*1.0/12329752*100)   #28%
    #return 1.0/(20000000-inp[5])   #0.618429189858
    #return inp[5]**2       #53.8045828797
    #return (inp[5]*1.0/20000000)**2*inp[5]     #46.3834326015
    #return (inp[5]*1.0/1500000)**0.5   #  56.5875142341
    #return 100.0/(1+math.exp(-(inp[5]-1500000)*1.0/100000))    #54.4230120696
    #return 200.0/(1+math.exp(-(inp[5]-1500000)*1.0/1000000)) - 100    #55.9691806875
    #return math.log(inp[5])      #58.7522076851
    return inp[5]               #58.4429930902
    #return 200.0/(1+math.exp(-(inp[5]*1.0/1500000-1)*10)) - 100       #55.3506558544
    #return math.exp(1.0/(1+math.exp(-(inp[5]*1.0/1500000-1)*10)))     #


# One transformed value per row of `output`, stored as a new column of the copy.
output_scaled["scaled_downloads"] = output.apply(generator_outputScaler,axis = 1)

# Alternative: standardise the downloads (subtract the mean, divide by the standard deviation).
#output_scaled["scaled_downloads"] =(output['cumulative_downloads_2016-02']-output['cumulative_downloads_2016-02'].mean())/output['cumulative_downloads_2016-02'].std()

In [None]:
output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False).head(320)

In [None]:
plt.plot(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"]))
plt.axis([0, 300,min(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"])),max(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"]))])

In [None]:
# 10-fold CV comparing the baseline linear model (feature columns 4..11, raw target in
# column 5 of `output`) with the current model trained on column 6 of `output_scaled`
# (presumably "scaled_downloads" -- confirm the column order).
K = 10

# NOTE(review): the row count 32339 is hard-coded; other cells use len(predictors) -- confirm they agree.
kf = KFold(32339, n_folds=K)
old_top = []
new_top = []
for train, test in kf:
    #base model
    old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
    old_y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])
    old_top.append(metric(old_y_pred,output.as_matrix()[test,5]))
    #model (alternatives tried are kept commented out)
    #mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    #mod=linear_model.Lasso(alpha=100,fit_intercept=False).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    #mod = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls').fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6]) 
    #mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100).fit(predictors.as_matrix()[train,4:], output_scaled.as_matrix()[train,6])
    y_pred =  mod.predict(predictors.as_matrix()[test,4:])
    # The new model is scored against the scaled target, the baseline against the raw one.
    new_top.append(metric(y_pred,output_scaled.as_matrix()[test,6]))
# Mean baseline score, feature list, mean score of the current model.
print 1.0*sum(old_top)/len(old_top)
print list(predictors.columns.values)
print 1.0*sum(new_top)/len(new_top)

In [None]:
plt.plot(list(output.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["cumulative_downloads_2016-02"]))
plt.axis([0, 3200, 0, 13000000])

In [None]:
plt.plot(list(output_scaled.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)["scaled_downloads"]))


In [None]:
list(set(lang_series))

In [None]:
predictors

In [None]:
predictors.sort_values(by= 'week_8',ascending = False)[0:3233].head(3)

In [None]:
output.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)[0:323].head(3)

In [None]:
len(predictors.sort_values(by= 'week_8',ascending = False)[0:3233].merge(output.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)[0:323], how='inner', on=["id","name","category","device"]))

In [None]:
# WARNING: destructive cell. It overwrites every input with its first 30 entries
# (300 for `reviews`), presumably to get a small sample for quick experiments --
# the original data must be reloaded from disk before running any full evaluation.
predictors = predictors[:30]
downloads = downloads[:30]
reviews = reviews[:300]
ratings = ratings[:30]
usages = usages[:30]
revenues = revenues[:30]
output =  output[:30]
dateRange = dateRange[:30]
prev_downloads = prev_downloads[:30]

In [None]:
def detect_language(x):
    """Map a piece of text to one of the coarse language labels used in this notebook.

    Returns 'japanese', 'chinese', 'korean' or 'english' when langdetect
    reports 'ja', 'zh-cn', 'ko' or 'en' respectively, and 'other' for any
    other detected code or when detection raises (e.g. for non-text input).

    Byte strings are decoded as UTF-8 with undecodable bytes ignored; text
    that is already unicode is passed to langdetect unchanged.
    """
    labels = {
        'ja': 'japanese',
        'zh-cn': 'chinese',
        'ko': 'korean',
        'en': 'english',
    }
    try:
        text = x.decode('utf8','ignore') if isinstance(x, bytes) else x
        detected = langdetect.detect(text)
    except Exception:
        # Best-effort labelling: a failed detection is reported as 'other'.
        # `Exception` (rather than a bare except) keeps KeyboardInterrupt able
        # to stop a long-running `apply`.
        return 'other'
    return labels.get(detected, 'other')

def set_lang_categories(x, cat):
    """Indicator for one category: 1 when `x` equals `cat`, 0 otherwise."""
    return 1 if x == cat else 0

# Detect the language of every review text and add one 0/1 indicator column per detected label.
reviews_withLang = reviews.copy()
reviews_lang = reviews['text'].apply(detect_language)
for cat in list(set(reviews_lang)):
    reviews_withLang[cat] = reviews_lang.apply(set_lang_categories, args=(cat,))
    
# Earlier variant: the same indicators built from the app name in `predictors`.
#lang_series = predictors['name'].apply(detect_language)
#for cat in list(set(lang_series)):
#    predictors[cat] = lang_series.apply(set_lang_categories, args=(cat,))

# Display the result.
reviews_withLang

In [None]:
# Rebuild the per-review language indicator columns (one 0/1 column per detected label)...
reviews_withLang = reviews.copy()
reviews_lang = reviews['text'].apply(detect_language)
for cat in list(set(reviews_lang)):
    reviews_withLang[cat] = reviews_lang.apply(set_lang_categories, args=(cat,))

# ...then sum them per 'id', giving one row per id with the number of reviews in each language.
# After reset_index() 'id' is an ordinary first column and the index is positional.
reviews_withLang = reviews_withLang.groupby('id')[list(set(reviews_lang))].sum().reset_index()

def gini_impurity(inp):
    """Gini impurity of the counts stored in `inp` after its first element.

    The first element is skipped; every remaining count is turned into its
    share p of their total, and the sum of p * (1 - p) is returned.
    """
    counts = inp[1:]
    tot = sum(counts)
    impurity = 0
    for x in counts:
        share = 1.0*x / tot
        impurity += share * (1 - share)
    return impurity

# Language impurity of the reviews of each app (row-wise; gini_impurity skips the leading 'id' value).
reviews_withLang["gini_reviews"] = reviews_withLang.apply(gini_impurity,axis=1)
# DataFrame.join(on='id') matches the 'id' column of `predictors` against the INDEX of the
# other object, so the impurities must be indexed by 'id' (reviews_withLang carries a
# positional index after reset_index()). Apps without any review get 0; fillna is needed
# because replace("NaN", 0) only replaces the literal string, not missing values.
predictors["gini_reviews"] = predictors.join(reviews_withLang.set_index('id')["gini_reviews"],on='id')["gini_reviews"].fillna(0)

In [None]:
predictors 

In [None]:
reviews_withLang

In [None]:



# Inspection cell: relies on `estimate_class10` and `y_pred_2` left over from the last
# fold of a previous cross-validation cell.
estimate_top1 = estimate_class10.copy()
# y_pred_2 is exponentiated before display (the second-stage model in this notebook is fitted on np.log of the downloads).
estimate_top1["secondEstimate"] = np.exp(y_pred_2).astype(int)
# Attach the actual downloads and show the candidates sorted by them.
estimate_top1 = estimate_top1.merge(output, how='left', on=["id","name","category","device"]).copy()
estimate_top1 = estimate_top1.drop("Unnamed: 0",1)
estimate_top1.sort_values(by= 'cumulative_downloads_2016-02',ascending = False)


In [None]:
# Inspection cell: rows of the actual top 1% whose index is NOT in `estimate_top1_select`,
# shown with a few predictor columns and sorted by actual downloads.
# NOTE(review): `estimate_top1_select` and the 'download_sum' column are not defined in this part of the notebook -- confirm they exist.
output_test_top_1_precent.join(estimate_class10.ix[:,['days_since_release','cumulative_downloads_2015-02','week_1','week_4','week_8','download_sum']]).sort_values('cumulative_downloads_2016-02',ascending=False)[~output_test_top_1_precent.index.isin(estimate_top1_select.index)]