#### Initial Setup

In [None]:
#imported libraries
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import boto
import numpy as np
from numpy import matlib
import scipy as sp
import math
import matplotlib.pyplot as plt
import langdetect
import datetime
%matplotlib inline  
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import Imputer

import os
import findspark; findspark.init()
import pyspark
from pyspark import SparkConf
from pyspark.sql.types import *
from pyspark.sql.types import Row
import pyspark.sql.functions as func
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

from pyspark.mllib.linalg import Vectors 
from pyspark.mllib.regression import LabeledPoint  
from pyspark.mllib.regression import LinearRegressionWithSGD 

# Ask spark-submit to fetch the spark-csv package used by every CSV load in
# this notebook. This is set before the SparkContext is created below.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
  "--packages com.databricks:spark-csv_2.11:1.4.0 pyspark-shell"
)

# NOTE: "SPARK_DRIVER_MEMORY=5G" also has to be added to ./conf/spark-env.sh

# getOrCreate() returns the already-running SparkContext when this cell is
# re-executed and creates one otherwise. Unlike a blanket try/except around
# the constructor, a genuine start-up failure is no longer hidden and `sc`
# is always bound afterwards.
conf = SparkConf().set("spark.executor.memory", "3g")
sc = pyspark.SparkContext.getOrCreate(conf=conf)

sqlCtx = pyspark.sql.SQLContext(sc)
sc.setCheckpointDir('checkpoint/')

#### Load files

In [None]:
# Explicit schema applied to the reviews file, which is read through pandas
# and then converted to a Spark DataFrame.
reviews_schema = StructType([
    StructField("id",IntegerType(),True),
    StructField("name",StringType(),True),
    StructField("country",StringType(),True),
    StructField("rating",IntegerType(),True),
    StructField("date",StringType(),True),
    StructField("title",StringType(),True),
    StructField("version",StringType(),True),
    StructField("text",StringType(),True),
    StructField("reviewer",StringType(),True)
])


def _load_datasets(spark_prefix, pandas_prefix):
    """Load every input table from a single location.

    spark_prefix is prepended to the file names read through spark-csv and
    pandas_prefix to the reviews file read through pandas; an empty prefix
    means "current working directory".

    Returns the tuple (downloads, ratings, usages, revenues, output,
    prev_downloads, release_date, text_score, title_score, avg_score,
    rating_country, reviews).
    """
    def read_csv(filename, header='true'):
        reader = sqlCtx.read.format('com.databricks.spark.csv')
        reader = reader.options(header=header, inferSchema='true')
        # drop('') removes the column whose header is empty, if there is one
        # (presumably a saved row index -- confirm against the files).
        return reader.load(spark_prefix + filename).drop('')

    downloads = read_csv('train_app_downloads.csv')
    ratings = read_csv('train_app_rating.csv')
    usages = read_csv('train_usage.csv')
    revenues = read_csv('train_revenue.csv')
    output = read_csv('train_final_downloads.csv')
    prev_downloads = read_csv('train_cumulative_downloads_2015-02.csv')
    release_date = read_csv('train_release_date.csv')
    # The two sentiment files are read without a header row.
    text_score = read_csv('sentiment.csv', header='false')
    title_score = read_csv('t_sentiment.csv', header='false')
    avg_score = read_csv('avg_sent_score.csv')
    rating_country = read_csv('train_rating_by_country.csv')

    reviews = pd.read_csv(pandas_prefix + 'train_app_review.csv')
    reviews = sqlCtx.createDataFrame(reviews, reviews_schema)

    return (downloads, ratings, usages, revenues, output, prev_downloads,
            release_date, text_score, title_score, avg_score,
            rating_country, reviews)


try:
    # Credentials come from the environment; a missing variable raises
    # KeyError and triggers the local fallback below.
    _hadoop_conf = sc._jsc.hadoopConfiguration()
    _hadoop_conf.set("fs.s3n.awsAccessKeyId", os.environ['AWS_ACCESS_KEY_ID'])
    _hadoop_conf.set("fs.s3n.awsSecretAccessKey", os.environ['AWS_SECRET_ACCESS_KEY'])
    _datasets = _load_datasets('s3n://cs341bucket1/Data/', 's3://cs341bucket1/Data/')
except Exception as load_error:
    # Best-effort fallback, but say why S3 was skipped instead of silently
    # swallowing the error (and without trapping KeyboardInterrupt).
    print("Could not load the data from S3 (%r); using the files in the working directory." % (load_error,))
    _datasets = _load_datasets('', '')

(downloads, ratings, usages, revenues, output, prev_downloads,
 release_date, text_score, title_score, avg_score,
 rating_country, reviews) = _datasets

# NOTE(review): this part of the cell runs unconditionally, so the tables it
# loads replace the ones produced by the S3 / working-directory loader earlier
# in the cell. "rating_country" is the one table that is not reloaded here.

# Directory holding the CSV files. The default is the original hard-coded
# location; set CS341_DATA_DIR to run the notebook on another machine.
LOCAL_DATA_DIR = os.environ.get(
    'CS341_DATA_DIR',
    'C:\\Users\\andre\\Documents\\StanfordMS\\CS 341\\Data')


def _read_local_csv(filename, header='true'):
    """Read LOCAL_DATA_DIR/filename through spark-csv with schema inference.

    header is passed to spark-csv as-is ('true' or 'false'). The column whose
    header is empty, if any, is dropped from the result.
    """
    reader = sqlCtx.read.format('com.databricks.spark.csv')
    reader = reader.options(header=header, inferSchema='true')
    return reader.load(os.path.join(LOCAL_DATA_DIR, filename)).drop('')


downloads = _read_local_csv('train_app_downloads.csv')
ratings = _read_local_csv('train_app_rating.csv')
usages = _read_local_csv('train_usage.csv')
revenues = _read_local_csv('train_revenue.csv')
output = _read_local_csv('train_final_downloads.csv')
prev_downloads = _read_local_csv('train_cumulative_downloads_2015-02.csv')
release_date = _read_local_csv('train_release_date.csv')
# The two sentiment files are read without a header row.
text_score = _read_local_csv('sentiment.csv', header='false')
title_score = _read_local_csv('t_sentiment.csv', header='false')
avg_score = _read_local_csv('avg_sent_score.csv')

# The reviews file is read through pandas and converted with an explicit
# schema.
reviews_schema = StructType([
        StructField("id",IntegerType(),True),
        StructField("name",StringType(),True),
        StructField("country",StringType(),True),
        StructField("rating",IntegerType(),True),
        StructField("date",StringType(),True),
        StructField("title",StringType(),True),
        StructField("version",StringType(),True),
        StructField("text",StringType(),True),
        StructField("reviewer",StringType(),True)
    ])
reviews = pd.read_csv(os.path.join(LOCAL_DATA_DIR, 'train_app_review.csv'))
reviews = sqlCtx.createDataFrame(reviews,reviews_schema)

In [None]:
#usage imputation
# -1 is the missing-value marker; it is replaced by the median of the same
# column, computed separately inside each group handled below.
imputer = Imputer(missing_values=-1, strategy='median', axis=0)


def _impute_group(group_df, n_key_cols):
    """Return group_df with its value columns median-imputed.

    The first n_key_cols columns (by position) are copied unchanged; the
    remaining columns go through the module-level `imputer`. Row index and
    column names of the input are preserved.
    """
    key_part = group_df.iloc[:, :n_key_cols]
    value_part = pd.DataFrame(imputer.fit_transform(group_df.iloc[:, n_key_cols:]))
    value_part.index = key_part.index
    imputed = pd.concat([key_part, value_part], axis=1)
    imputed.columns = group_df.columns
    return imputed


#usage imputation: one group per (category, metric)
# NOTE(review): 6 leading columns are treated as keys here, while the feature
# queries later read the usage values from columns[5:13]. If the table really
# has 5 key columns, the first value column is never imputed -- confirm.
pd_usages = usages.toPandas()
category = list(set(pd_usages["category"].values))
# Seeded with an empty frame so the result keeps the column layout even when
# no group is found; all pieces are concatenated once at the end instead of
# re-copying the accumulated frame on every iteration.
pieces = [pd.DataFrame(columns = pd_usages.columns)]
for cat in category:
    cat_df = pd_usages.loc[pd_usages["category"]==cat,:]
    for metric in range(1,5):
        curr_df = cat_df.loc[cat_df["metric"]==metric,:]
        if curr_df.empty:
            # The imputer cannot be fitted on zero rows.
            continue
        pieces.append(_impute_group(curr_df, 6))
imp = pd.concat(pieces,axis = 0)
usages = sqlCtx.createDataFrame(imp)

#revenue imputation: one group per (category, device)
# NOTE(review): same question as above -- 5 key columns here, but the revenue
# features are later computed from columns[4:].
pd_revenues = revenues.toPandas()
pieces = [pd.DataFrame(columns = pd_revenues.columns)]
for cat in category:
    cat_df = pd_revenues.loc[pd_revenues["category"]==cat,:]
    for dev in ["iphone","ipad"]:
        curr_df = cat_df.loc[cat_df["device"]==dev,:]
        if curr_df.empty:
            continue
        pieces.append(_impute_group(curr_df, 5))
imp = pd.concat(pieces,axis = 0)
revenues = sqlCtx.createDataFrame(imp)



#### Generate Predictors

In [None]:
# The 56 days starting on 2015-03-01, rendered once in the 'YYYY-MM-DD' form
# that is renamed away and once in the 'MM_DD_YYYY' form used from here on.
_feature_days = pd.date_range('03/01/2015', periods=56)
old_dateRange = [day.strftime('%Y-%m-%d') for day in _feature_days]
dateRange = [day.strftime('%m_%d_%Y') for day in _feature_days]

# Rename the daily columns of the three time-series tables.
for old_name, new_name in zip(old_dateRange, dateRange):
    revenues = revenues.withColumnRenamed(old_name, new_name)
    usages = usages.withColumnRenamed(old_name, new_name)
    downloads = downloads.withColumnRenamed(old_name, new_name)

# Same treatment for the two cumulative-download columns: '-' becomes '_'.
output = output.withColumnRenamed(
    "cumulative_downloads_2016-02", "cumulative_downloads_2016_02")
prev_downloads = prev_downloads.withColumnRenamed(
    "cumulative_downloads_2015-02", "cumulative_downloads_2015_02")

In [None]:
#Initialization
# Start the predictor table from the four identifying columns of downloads.
predictors = downloads.select('id', 'name', 'category', 'device')

In [None]:
# Generate the weekly downloads

# Make the three time-series tables queryable by name from Spark SQL.
for table_name, frame in (("downloads", downloads),
                          ("usages", usages),
                          ("revenues", revenues)):
    sqlCtx.registerDataFrameAsTable(frame, table_name)

def get_log_week(*args):
    """Return the log of the mean of the non-zero values in ``args``.

    Negative values are treated as 0 before counting. Zeros are excluded
    from the mean. Returns 0.0 when every value is zero or negative.

    The query in this notebook always passes the 7 daily columns of one
    week; the divisor follows the number of arguments, so other window
    lengths are averaged correctly as well.
    """
    values = [0 if x < 0 else x for x in args]
    nb_0 = values.count(0)
    if nb_0 == len(values):
        return float(0)
    return math.log(1.0 * sum(values) / (len(values) - nb_0))
# Expose get_log_week to Spark SQL under the same name, declared as FloatType.
sqlCtx.registerFunction("get_log_week", get_log_week,returnType=FloatType())
def get_download_sum(*args):
    """Return the sum of ``args`` as a float, counting every -1 as 0."""
    total = 0
    for value in args:
        if value == -1:
            # -1 contributes nothing to the total.
            continue
        total += value
    return 1.0 * total
# Expose get_download_sum to Spark SQL under the same name, declared as FloatType.
sqlCtx.registerFunction("get_download_sum", get_download_sum,returnType=FloatType())

# Build the first predictor columns from the "downloads" table:
#   week_1 .. week_8 : get_log_week over consecutive 7-column slices of
#                      dateRange (8 slices covering the 56 daily columns)
#   download_sum     : get_download_sum over all the daily columns
# The query text is assembled by string concatenation; the backslashes at the
# end of the lines sit inside the string literals and simply continue them.
predictors = sqlCtx.sql("SELECT id, name, category, device, \
           get_log_week("+ ",".join(dateRange[0:7])+") AS week_1 \
            ,get_log_week("+",".join(dateRange[7:14])+") AS week_2 \
            ,get_log_week("+ ",".join(dateRange[14:21])+") AS week_3 \
           ,get_log_week("+",".join(dateRange[21:28])+") AS week_4 \
           ,get_log_week("+",".join(dateRange[28:35])+") AS week_5 \
           ,get_log_week("+",".join(dateRange[35:42])+") AS week_6 \
           ,get_log_week("+",".join(dateRange[42:49])+") AS week_7 \
           ,get_log_week("+",".join(dateRange[49:56])+") AS week_8\
           ,get_download_sum("+ ",".join(dateRange)+") AS download_sum \
           from downloads")
# Register the result so that later SQL statements can select from "predictors".
sqlCtx.registerDataFrameAsTable(predictors, "predictors")

#I worked around the error with this modification.
#I don't know why I couldn't run the code before, but this workaround gives the same result.
#predictors = sqlCtx.sql("SELECT "+', '.join(predictors.columns)+", week_1+week_2+week_3+week_4+week_5+week_6+week_7+week_8 AS download_sum FROM predictors")


In [None]:
# Split the usage table into one DataFrame per metric value (1 to 4).
_metric_query = "SELECT * FROM usages WHERE metric = %d"
m1 = sqlCtx.sql(_metric_query % 1)
m2 = sqlCtx.sql(_metric_query % 2)
m3 = sqlCtx.sql(_metric_query % 3)
m4 = sqlCtx.sql(_metric_query % 4)

# Register the four slices and the average sentiment scores as SQL tables.
for table_name, frame in (("m1", m1), ("m2", m2), ("m3", m3), ("m4", m4),
                          ("avg_score", avg_score)):
    sqlCtx.registerDataFrameAsTable(frame, table_name)

In [None]:
# Make coefficients

def get_coefficients(*args):
    """Return the leading coefficient of a polynomial fitted to a series.

    args[0] is the polynomial degree and args[1:] the series. Only strictly
    positive values are kept; the fit is made on the cumulative sum of their
    logs against positions 0, 1, 2, ... Returns 0.0 when at most one value
    is strictly positive.
    """
    degree = args[0]
    log_values = []
    for value in args[1:]:
        if value > 0:
            log_values.append(np.log(value))
    if len(log_values) <= 1:
        return float(0)
    positions = range(len(log_values))
    fitted = np.polyfit(positions, np.cumsum(log_values), degree)
    # np.polyfit lists the highest-degree coefficient first.
    return float(fitted[0])
    
#Generate the step max and min 
def get_maxStep(maximum,*args):
    """Return the log of the largest or smallest day-over-day ratio.

    maximum: truthy selects the largest ratio, falsy the smallest.
    args: the series; -1 is treated as 0.

    A ratio args[d] / args[d-1] is only formed when both neighbours are
    non-zero. Returns 0.0 when there is no such pair, or when the selected
    ratio is not strictly positive.

    The earlier implementation seeded the running value with 0 for both
    modes, so the minimum could never be replaced by a positive ratio and
    min_step was always 0. It also assumed exactly 56 values.
    """
    values = [0 if x == -1 else x for x in args]
    ratios = [float(current) / previous
              for previous, current in zip(values, values[1:])
              if previous != 0 and current != 0]
    if not ratios:
        return float(0)
    step = max(ratios) if maximum else min(ratios)
    if step <= 0:
        # The log is undefined here; report "no step" instead.
        return float(0)
    return float(np.log(step))

def get_std(*args):
    """Return the population standard deviation of ``args``, with -1 read as 0."""
    cleaned = []
    for value in args:
        cleaned.append(0 if value == -1 else value)
    return float(np.std(cleaned))

def get_nbMissing(*args):
    """Return how many of the arguments are equal to -1."""
    missing = 0
    for value in args:
        if value == -1:
            missing += 1
    return missing

#Generate the daily average

def get_revenue_coefficients(*args):
    """Return the leading coefficient of a polynomial fitted to log-values.

    args[0] is the polynomial degree and args[1:] the series. Only strictly
    positive values are kept; their logs are fitted against positions
    0, 1, 2, ... Returns 0.0 when at most one value is strictly positive.
    """
    degree = args[0]
    time_series = []
    for value in args[1:]:
        if value > 0:
            time_series.append(np.log(value))
    if len(time_series) < 2:
        return float(0)
    positions = np.array(range(len(time_series)))
    fitted = np.polyfit(positions, time_series, degree)
    # np.polyfit lists the highest-degree coefficient first.
    return float(fitted[0])

def get_usage_coefficients(*args):
    """Return the leading coefficient of a polynomial fitted to the series.

    args[0] is the polynomial degree and args[1:] the series, fitted against
    positions 0, 1, 2, ... Returns 0.0 when the series is empty or contains
    a -1.

    The fallback is a float on purpose: this function is registered as a
    FloatType UDF, and a Python int is not converted by Spark for that type
    (it typically comes back as NULL). The x axis follows the series length
    instead of assuming exactly 8 values.
    """
    degree = args[0]
    series = list(args[1:])
    if not series or -1 in series:
        return float(0)
    fitted = np.polyfit(range(len(series)), series, degree)
    # np.polyfit lists the highest-degree coefficient first.
    return float(fitted[0])

def get_usage_max(*args):
    """Return the largest value of the series, ignoring every -1.

    args[0] is skipped (the SQL queries pass a placeholder there) and
    args[1:] is the series. Returns 0.0 when no value remains.

    The fallback is a float on purpose: this function is registered as a
    FloatType UDF, and a Python int is not converted by Spark for that type
    (it typically comes back as NULL).
    """
    observed = [c for c in args[1:] if c != -1]
    if not observed:
        return float(0)
    return float(np.max(observed))

def get_usage_mean(*args):
    """Return the mean of the series, ignoring every -1.

    args[0] is skipped (the SQL queries pass a placeholder there) and
    args[1:] is the series. Returns 0.0 when no value remains.

    The fallback is a float on purpose: this function is registered as a
    FloatType UDF, and a Python int is not converted by Spark for that type
    (it typically comes back as NULL).
    """
    observed = [c for c in args[1:] if c != -1]
    if not observed:
        return float(0)
    return float(np.mean(observed))

def get_revenue_max(*args):
    """Return the largest log-value of the series.

    args[0] is skipped and args[1:] is the series. Only strictly positive
    values are kept before taking logs. Returns 0.0 when at most one value
    is strictly positive.
    """
    time_series = []
    for value in args[1:]:
        if value > 0:
            time_series.append(np.log(value))
    if len(time_series) < 2:
        return float(0)
    return float(np.max(time_series))

def get_revenue_mean(*args):
    """Return the mean log-value of the series.

    args[0] is skipped and args[1:] is the series. Only strictly positive
    values are kept before taking logs. Returns 0.0 when at most one value
    is strictly positive.
    """
    time_series = []
    for value in args[1:]:
        if value > 0:
            time_series.append(np.log(value))
    if len(time_series) < 2:
        return float(0)
    return float(np.mean(time_series))

def get_dailyAvg(*args):
    """Return the sum of the logs of the positive values over the non-zero count.

    -1 is read as 0. Returns 0.0 when every value is zero after that
    substitution.
    """
    cleaned = [0 if value == -1 else value for value in args]
    nb_nonzero = np.count_nonzero(cleaned)
    if nb_nonzero == 0:
        return float(0)
    log_terms = []
    for value in cleaned:
        if value > 0:
            log_terms.append(np.log(value))
    return float(sum(log_terms)) / nb_nonzero

# Register the feature helpers as Spark SQL UDFs: (SQL name, function, type).
# get_maxStep is registered once, as FloatType. It used to be registered as
# IntegerType first and then immediately re-registered as FloatType under the
# same name, so only the second registration was followed by any query.
for udf_name, udf, return_type in [
    ("get_nbMissing", get_nbMissing, IntegerType()),
    ("get_std", get_std, FloatType()),
    ("get_maxStep", get_maxStep, FloatType()),
    ("get_coefficients", get_coefficients, FloatType()),
    # Note the SQL name differs from the Python name for this one.
    ("daily_avg", get_dailyAvg, FloatType()),
    ("get_usage_coefficients", get_usage_coefficients, FloatType()),
    ("get_revenue_coefficients", get_revenue_coefficients, FloatType()),
    ("get_usage_max", get_usage_max, FloatType()),
    ("get_usage_mean", get_usage_mean, FloatType()),
    ("get_revenue_max", get_revenue_max, FloatType()),
    ("get_revenue_mean", get_revenue_mean, FloatType()),
]:
    sqlCtx.registerFunction(udf_name, udf, returnType=return_type)

# Columns that identify one (app, device) row; every feature table produced
# here carries them and is joined back to the predictors on them.
_FEATURE_KEYS = ["id", "name", "category", "device"]


def _udf_call(udf_name, columns, first_arg=None):
    """Return the SQL text ``udf_name([first_arg,]col1,col2,...)``."""
    arguments = list(columns)
    if first_arg is not None:
        arguments = [str(first_arg)] + arguments
    return "%s(%s)" % (udf_name, ",".join(arguments))


def _select_sql(expressions, table):
    """Return ``SELECT <key columns>, <expressions> FROM table``."""
    return "SELECT %s FROM %s" % (", ".join(_FEATURE_KEYS + expressions), table)


def _series_summary_sql(table, prefix, family, columns):
    """Build the coefficient / max / mean query shared by usage and revenue.

    family is "usage" or "revenue" and selects the get_<family>_* UDFs;
    prefix is prepended to the output column names (<prefix>_coef_0 to
    <prefix>_coef_2, <prefix>_max, <prefix>_mean).
    """
    expressions = [
        "%s AS %s_coef_%d" % (
            _udf_call("get_%s_coefficients" % family, columns, degree),
            prefix, degree)
        for degree in range(3)
    ]
    # The max/mean UDFs skip their first argument; 0 is a placeholder.
    expressions.append(
        "%s AS %s_max" % (_udf_call("get_%s_max" % family, columns, 0), prefix))
    expressions.append(
        "%s AS %s_mean" % (_udf_call("get_%s_mean" % family, columns, 0), prefix))
    return _select_sql(expressions, table)


# Download features: polynomial coefficients of degree 0 to 3, largest and
# smallest step, standard deviation, number of -1 entries and daily average.
_download_expressions = [
    "%s AS coef_%d" % (_udf_call("get_coefficients", dateRange, degree), degree)
    for degree in range(4)
]
_download_expressions += [
    "%s AS max_step" % _udf_call("get_maxStep", dateRange, True),
    "%s AS min_step" % _udf_call("get_maxStep", dateRange, False),
    "%s AS downloads_std" % _udf_call("get_std", dateRange),
    "%s AS nb_missing" % _udf_call("get_nbMissing", dateRange),
    "%s AS daily_avg" % _udf_call("daily_avg", dateRange),
]
temp_downloads = sqlCtx.sql(_select_sql(_download_expressions, "downloads"))

# Usage features, one table per metric; the values are read from
# columns[5:13] of each metric table.
temp_m1 = sqlCtx.sql(_series_summary_sql("m1", "m1", "usage", m1.columns[5:13]))
temp_m2 = sqlCtx.sql(_series_summary_sql("m2", "m2", "usage", m2.columns[5:13]))
temp_m3 = sqlCtx.sql(_series_summary_sql("m3", "m3", "usage", m3.columns[5:13]))
temp_m4 = sqlCtx.sql(_series_summary_sql("m4", "m4", "usage", m4.columns[5:13]))

# Revenue features; the values are read from columns[4:] of the revenue table.
temp_revenues = sqlCtx.sql(
    _series_summary_sql("revenues", "rev", "revenue", revenues.columns[4:]))

# Left-join everything onto the predictors, in the same order as before.
for _feature_table in (temp_downloads, temp_revenues,
                       temp_m1, temp_m2, temp_m3, temp_m4, avg_score):
    predictors = predictors.join(_feature_table, _FEATURE_KEYS, how='left_outer')



In [None]:
# previous downloads addition
# Left-join the earlier cumulative downloads on (id, device).
predictors = predictors.join(
    prev_downloads,
    on=["id", "device"],
    how='left_outer',
)

In [None]:
# Days since release generation with imputation
def get_days(date, id):
    """Return the number of days between ``date`` and 2015-03-01.

    date: release date as 'MM/DD/YYYY' or 'YYYY-MM-DD', or the literal
        "no_date" when it is unknown.
    id: numeric app id, only used when the date is unknown.

    For "no_date" the value is imputed from a cubic polynomial in
    id / 100000000 (presumably fitted on apps with a known release date --
    confirm) and truncated to an int.

    Raises ValueError when the date matches neither format. Only ValueError
    triggers the second format, so unrelated errors are no longer routed
    through a bare ``except``.
    """
    if date == "no_date":
        id = 1.0*id/100000000
        return int(5856.25394104 -1731.74798728*id+195.553086*id**2  -8.12861635*id**3)

    reference_day = datetime.date(2015, 3, 1)
    try:
        released = datetime.datetime.strptime(date, '%m/%d/%Y').date()
    except ValueError:
        released = datetime.datetime.strptime(date, '%Y-%m-%d').date()
    return (reference_day - released).days



# One row per app id found in downloads, with its release date where known
# and the "no_date" placeholder otherwise.
_app_ids = downloads[["id"]].dropDuplicates()
release_date = _app_ids.join(release_date, ["id"], "left")
release_date = release_date.fillna("no_date", ["release_date"])

sqlCtx.registerDataFrameAsTable(release_date, "release_date")
sqlCtx.registerFunction("get_days", get_days, returnType=IntegerType())

# Convert each release date (or placeholder) into days_since_release.
_days_query = (
    "SELECT id"
    ", get_days(release_date, id) AS days_since_release "
    " FROM release_date"
)
temp_date = sqlCtx.sql(_days_query)

predictors = predictors.join(temp_date, ["id"], "left")

In [None]:
#ratings generation
# Turn the per-star rating counts into shares of the total, and keep the
# total itself as num_ratings.
sqlCtx.registerDataFrameAsTable(ratings, "ratings")

# The one-star count is read from a column named "start1" in the table.
_total_ratings = "(start1+star2+star3+star4+star5)"
_ratings_query = ["SELECT id,name,category "]
for _count_column, _share_column in [("start1", "star1"), ("star2", "star2"),
                                     ("star3", "star3"), ("star4", "star4"),
                                     ("star5", "star5")]:
    _ratings_query.append(", CAST(1.0*%s/%s AS float) AS %s "
                          % (_count_column, _total_ratings, _share_column))
_ratings_query.append(", %s AS num_ratings " % _total_ratings)
_ratings_query.append(" FROM ratings")
temp_ratings = sqlCtx.sql("".join(_ratings_query))

predictors = predictors.join(temp_ratings, ["id", "name", "category"], "left")

In [None]:
# Categories
# One 0/1 indicator column per category, leaving out the last category
# returned by the query (the row order of the GROUP BY is whatever Spark
# returns).
_raw_categories = [row.category for row in sqlCtx.sql(
    "SELECT category FROM downloads group by category").collect()]
# Column-safe names: spaces replaced by underscores.
list_categories = [raw.replace(" ","_") for raw in _raw_categories]
for _raw_category, cat in list(zip(_raw_categories, list_categories))[:-1]:
    sqlCtx.registerDataFrameAsTable(predictors, "predictors")
    # Compare against the category as stored in the data and use the escaped
    # name only as the column alias. Comparing against the escaped name made
    # the indicator constantly 0 for any category containing a space.
    predictors=sqlCtx.sql(
        'SELECT *, CASE WHEN (category = "%s") THEN 1 ELSE 0 END AS %s FROM predictors'
        % (_raw_category, cat))


In [None]:
# Device
# Add a 0/1 "iphone" column derived from device, then refresh the
# registered "predictors" table so that it includes the new column.
sqlCtx.registerDataFrameAsTable(predictors, "predictors")
_iphone_query = 'SELECT *, CASE WHEN (device = "iphone") THEN 1 ELSE 0 END AS iphone FROM predictors'
predictors = sqlCtx.sql(_iphone_query)
sqlCtx.registerDataFrameAsTable(predictors, "predictors")
#predictors=sqlCtx.sql('''SELECT *, CASE WHEN (device = "ipad") THEN 1 ELSE 0 END AS ipad FROM predictors''')

In [None]:
# Language codes that get their own indicator column; anything else is "other".
lang = ['ja','zh-cn','ko','en','other']
def get_language(x):
    """Return the detected language code of ``x``, or "other".

    "other" is returned when the top candidate from langdetect has a
    probability below 0.7, when its code is not in ``lang``, or when
    detection fails for any reason (best effort).

    Only byte strings are decoded. Text that is already unicode is passed
    through unchanged: calling .decode on a Python 2 unicode object first
    encodes it as ASCII, which fails for non-ASCII text and previously
    turned every such name into "other".
    """
    try:
        if isinstance(x, bytes):
            x = x.decode('utf8','ignore')
        detected = langdetect.detect_langs(x)[0]
        if detected.prob < 0.7:
            return "other"
        elif  detected.lang in lang:
            return detected.lang
        else:
            return "other"
    except Exception:
        # Deliberate best effort: undetectable input is bucketed as "other".
        return "other"
# Expose get_language to Spark SQL under the same name, declared as StringType.
sqlCtx.registerFunction("get_language", get_language,returnType=StringType())

In [None]:
#Language of the title
# One 0/1 indicator column per entry of `lang`, from the language detected
# in the app name; '-' in a code becomes '_' in the column name.
for _code in lang:
    _flag_column = _code.replace("-", "_")
    sqlCtx.registerDataFrameAsTable(predictors, "predictors")
    _language_query = (
        'SELECT *, CASE WHEN (get_language(name) = "%s") THEN 1     ELSE 0 END AS %s FROM predictors'
        % (_code, _flag_column))
    predictors = sqlCtx.sql(_language_query)

In [None]:
#Reviews 
#escape is used in case a value contains '-' or a space anywhere
def escape(text):
    """Return ``text`` with every space and every '-' replaced by '_'."""
    # Turn dashes into spaces first, then rejoin on every single space.
    return "_".join(text.replace("-", " ").split(" "))
# number of reviews
def get_recentReviews(date):
    """Return 1 when ``date`` ('YYYY-MM-DD') is on or before 2015-04-01, else 0."""
    cutoff = datetime.date(2015, 4, 1)
    review_day = datetime.datetime.strptime(date, '%Y-%m-%d').date()
    return 1 if review_day <= cutoff else 0

#First step
list_countries = [
    'United_States', 'France', 'Japan', 'Spain', 'United_Kingdom',
    'Saudi_Arabia', 'Germany', 'Hong_Kong', 'Switzerland', 'Turkey',
    'Netherlands', 'Australia', 'Norway', 'Sweden', 'China', 'Canada',
    'Tanzania', 'Denmark', 'South_Korea', 'Italy', 'Finland', 'Taiwan',
    'Russia', 'Philippines', 'Slovenia', 'Ireland', 'Belgium', 'Mexico',
    'Austria', 'India', 'Brazil', 'Benin', 'New_Zealand',
    'United_Arab_Emirates', 'Ukraine', 'Poland', 'Israel', 'Portugal',
    'Tunisia', 'Mali', 'Slovakia', 'Zimbabwe', 'Thailand', 'Panama',
    'Indonesia', 'Singapore', 'Greece', 'Senegal', 'Nicaragua', 'Hungary',
    'Czech_Republic', 'Macedonia', 'Chile', 'Uruguay', 'Malaysia',
    'Algeria', 'Nepal', 'Mauritania', 'Croatia',
]

# The three steps below used to be assembled as source-code strings and run
# with `exec`, using Python-2-only syntax. They are ordinary functions now:
# same computation, readable tracebacks, no dynamically generated code.


def _review_stats(review):
    """First step: map one review row to (app id, statistics of that review)."""
    stats = {
        "avg_review": review.rating,
        "recent_review": get_recentReviews(review.date),
        "nb_review": 1,
        "version": set([review.version]),
        "country": set([review.country]),
    }
    for level in range(1, 6):
        stats["review_rating_%d" % level] = int(review.rating == level)
    return (review.id, stats)


def _merge_review_stats(left, right):
    """Group step: add the counters and union the version / country sets."""
    merged = {
        "avg_review": left["avg_review"] + right["avg_review"],
        "recent_review": left["recent_review"] + right["recent_review"],
        "nb_review": left["nb_review"] + right["nb_review"],
        "country": left["country"].union(right["country"]),
        "version": left["version"].union(right["version"]),
    }
    for level in range(1, 6):
        key = "review_rating_%d" % level
        merged[key] = left[key] + right[key]
    return merged


def _finalize_review_stats(keyed_stats):
    """Clean step: turn the accumulated totals of one app into an output row."""
    app_id, stats = keyed_stats
    nb_review = stats["nb_review"]
    row = [
        app_id,
        1.0 * stats["avg_review"] / nb_review,
        stats["recent_review"],
        nb_review,
        # NOTE(review): pop() takes an arbitrary country from the set when an
        # app has reviews from several countries.
        escape(stats["country"].pop()),
        len(stats["version"]),  # -1 if want number of updates
    ]
    row += [1.0 * stats["review_rating_%d" % level] / nb_review
            for level in range(1, 6)]
    return row


review_rdd = reviews.rdd \
    .map(_review_stats) \
    .reduceByKey(_merge_review_stats) \
    .map(_finalize_review_stats)

#Put back into dataframe
grp_reviews = sqlCtx.createDataFrame(
    review_rdd,
    ["id", "avg_review", "recent_reviews", "nb_reviews", "country", "versions"]
    + ["review_rating_" + str(i) for i in range(1, 6)])

#Join with predictors
predictors = predictors.join(grp_reviews,["id"],"left").fillna("no_country",["country"])

In [None]:
# Generate DL Projection
sqlCtx.registerDataFrameAsTable(predictors, "predictors")
derived_feats = sqlCtx.sql("SELECT id, device\
    ,CAST(LOG(7*download_sum+cumulative_downloads_2015_02) AS float ) AS dl_projection \
    ,CAST((1000000.0*num_ratings/(cumulative_downloads_2015_02 + download_sum))AS float )  AS ratings_per_downloads \
    ,CAST((1.0*num_ratings/(days_since_release+60))AS float )  AS ratings_per_day \
    ,CAST((1000000.0*nb_reviews/download_sum)AS float )  AS review_per_downloads \
    ,CAST((1.0*recent_reviews/nb_reviews)AS float )  AS review_recent_over_old \
    ,CAST((1.0*cumulative_downloads_2015_02/(days_since_release+1))AS float )  AS downloads_per_day_before \
    FROM predictors")
predictors = predictors.join(derived_feats,["id","device"],"left")

#we could group by continent or use the market potential
# One-hot encode `country`: for each name in list_countries a 0/1 column with
# that name is appended to predictors.
# NOTE: list_countries is extended in place, so re-running this cell appends
# "no_country" again.
list_countries+=["no_country"]
for co in list_countries:
    # The table is re-registered on every pass so the next SELECT sees the
    # columns added so far.  The country name is spliced into the SQL text
    # both as a string literal and as the new column name.
    sqlCtx.registerDataFrameAsTable(predictors, "predictors")
    predictors=sqlCtx.sql('''SELECT *, CASE WHEN (country = "'''+co+'''") THEN 1 ELSE 0 END AS '''+co+''' FROM predictors''')
def get_market(country,device):
    """Return the market-size figure for *country* on *device*.

    The "iphone" device uses the first figure of each pair below; any other
    device value uses the second.  Countries missing from the table fall back
    to 15000 (iphone) or 100 (any other device).
    """
    #http://blog.nelso.com/2010/06/iphone-os-penetration-by-country.html
    # country -> (figure for "iphone", figure for every other device)
    market_sizes = {
        "United_States": (10683403, 223269),
        "France": (2248817, 2724),
        "Japan": (1378903, 2293),
        "Spain": (377346, 1494),
        "United_Kingdom": (2551128, 4197),
        "Germany": (1117716, 3403),
        "Hong_Kong": (299720, 2306),
        "Switzerland": (399364, 1698),
        "Netherlands": (372539, 2554),
        "Australia": (1207428, 1400),
        "Norway": (154218, 1333),
        "Sweden": (281622, 1188),
        "China": (725358, 12516),
        "Canada": (919074, 6275),
        "Denmark": (151426, 753),
        "Italy": (648718, 1370),
        "Taiwan": (174226, 1356),
        "Mexico": (215326, 3380),
        "Austria": (156322, 493),
        "Brazil": (219339, 2014),
        "Poland": (72114, 324),
        "Singapore": (402922, 1453),
        "Hungary": (33219, 211),
        "Czech_Republic": (42753, 203),
        "South_Korea": (530235, 2416),
        "Russia": (246421, 2183),
    }
    fallback = (15000, 100)
    column = 0 if device == "iphone" else 1
    return market_sizes.get(country, fallback)[column]
# Expose get_market to Spark SQL as an integer-returning UDF.
sqlCtx.registerFunction("get_market", get_market,returnType=IntegerType())

# Add market_size: the get_market figure for each row's (country, device).
sqlCtx.registerDataFrameAsTable(predictors, "predictors")
predictors=sqlCtx.sql('''SELECT *, get_market(country,device) AS market_size FROM predictors''')

In [None]:
# Collapse rating_country to one row per id: star1..star5 are summed over all
# rows of that id and nb_countries counts those rows.
# NOTE(review): nb_countries equals the number of countries only if the input
# has exactly one row per (id, country) -- confirm against the loader.
sqlCtx.registerDataFrameAsTable(rating_country, "rating_country")
rating_country = sqlCtx.sql('SELECT id\
, SUM(star1) AS total_star1\
, SUM(star2) AS total_star2\
, SUM(star3) AS total_star3\
, SUM(star4) AS total_star4\
, SUM(star5) AS total_star5\
, COUNT(1) AS nb_countries FROM rating_country GROUP BY id')
# Re-register so the table name now refers to the aggregated frame.
sqlCtx.registerDataFrameAsTable(rating_country, "rating_country")

# Left outer join: ids without rating rows keep null aggregate columns.
predictors = predictors.join(rating_country,["id"],how='left_outer')

In [None]:
'''#Taking the log
to_log = ["download_sum","cumulative_downloads_2015_02","dl_projection"]
def get_log(x) : 
    try:
        return math.log(x)
    except:
        return 0
sqlCtx.registerFunction("get_log", get_log,returnType=FloatType())

for name in to_log:
    cmd = 'SELECT '+','.join(filter(lambda x : x not in  to_log ,predictors.columns))
    for t in to_log:
        cmd+= " , get_log("+t+") AS "+t
    cmd+=" FROM predictors"
    sqlCtx.registerDataFrameAsTable(predictors, "predictors")
    predictors=sqlCtx.sql(cmd)'''

In [None]:
predictors = predictors.fillna(0)

In [None]:
predictors.cache()

In [None]:
predictors.head()

In [None]:
predictors.toPandas().to_csv("predictors_spark.csv", sep='\t',encoding='utf-8')

In [None]:
#to delete
# Reload the snapshot written by the to_csv cell above.  That cell writes the
# file tab-separated (sep='\t'), so the delimiter has to be passed explicitly:
# spark-csv splits on ',' by default and would otherwise read each line as a
# single column.  drop('') removes the column with an empty name (the pandas
# index written by to_csv).
predictors = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true',delimiter='\t') \
    .load('predictors_spark.csv').drop('')

#### ML  pipeline

In [None]:
# Feature columns: every predictor column except the identifiers and the raw
# text/categorical fields.  The names come straight from the DataFrame schema
# instead of sorting the whole RDD (rdd.top(1)) just to read one row's keys,
# which also makes the feature order follow the schema.
_non_feature_columns = ("name", "id", "category", "device", "country")
predictor_names = [col for col in predictors.columns if col not in _non_feature_columns]


def _to_keyed_labeled_point(joined):
    """Turn ((id, device), (predictor_row, label)) into ((id, device), LabeledPoint)."""
    key, (predictor_row, label) = joined
    # Build the dict once per row rather than once per feature.
    values = predictor_row.asDict()
    return (key, LabeledPoint(label, [values[col] for col in predictor_names]))


# Key both sides by (id, device) and inner-join predictors with the target
# column cumulative_downloads_2016_02 taken from `output`.
predictors_labelPoints = predictors.rdd\
.map(lambda x: ((x.id,x.device) ,x))\
.join(output.rdd.map(lambda x: ((x.id,x.device) ,x.cumulative_downloads_2016_02)))\
.map(_to_keyed_labeled_point)

predictors_labelPoints.cache()
predictors_labelPoints.top(1)

In [None]:
def get_preformance_old(labelsAndPredictions):
    """Score how well the predicted top 1% matches the true top 1%.

    labelsAndPredictions is an RDD of rows exposing id, device, pred and true.
    The value of the int(1% of rows)-th largest prediction (resp. true value)
    is used as a threshold, every row at or above its threshold is selected,
    and the two selections are joined on (id, device).

    Returns 10000.0 * matches / total, i.e. the matches expressed as a
    percentage of 1% of the rows.  Returns 0.0 when there are fewer than 100
    rows, because the top-1% slice is then empty (the threshold lookup used to
    fail with IndexError / ZeroDivisionError in that case).
    """
    total = labelsAndPredictions.count()
    top_n = int(0.01*total)
    if top_n == 0:
        return 0.0

    pred_threshold = labelsAndPredictions.takeOrdered(top_n,lambda x:-x.pred)[-1].pred
    true_threshold = labelsAndPredictions.takeOrdered(top_n,lambda x:-x.true)[-1].true
    pred_top = labelsAndPredictions.filter(lambda x: x.pred >= pred_threshold ).map(lambda x : ((x.id,x.device),1))
    true_top = labelsAndPredictions.filter(lambda x: x.true >= true_threshold ).map(lambda x : ((x.id,x.device),1))

    # Ties at the threshold can select more than 1% of the rows; report it.
    # The count is computed once instead of twice.
    pred_count = pred_top.count()
    if pred_count > total*0.01:
        print("Error "+str(pred_count) + "   "+str(total*0.01))

    return 10000.0*pred_top.join(true_top).count()/total


def get_preformance(labelsAndPredictions):
    """Score how well the predicted top 1% matches the true top 1%.

    labelsAndPredictions is an RDD of rows exposing id, device, pred and true.
    Exactly int(1% of rows) rows are taken from each ranking (largest pred,
    largest true) and compared on their (id, device) key.

    Returns 10000.0 * matches / total, i.e. the matches expressed as a
    percentage of 1% of the rows, or 0.0 when there are fewer than 100 rows
    and the top-1% slice is empty.

    takeOrdered returns a plain Python list on the driver, so the overlap is
    computed locally with sets rather than with RDD map/join.  The former
    "more than 1% selected" check is gone because takeOrdered never returns
    more than the requested number of rows.
    Assumes (id, device) is unique per row, as in the keyed RDD it is built
    from -- TODO confirm.
    """
    total = labelsAndPredictions.count()
    top_n = int(0.01*total)
    if top_n == 0:
        return 0.0

    pred_top = set((row.id, row.device)
                   for row in labelsAndPredictions.takeOrdered(top_n, lambda x: -x.pred))
    true_top = set((row.id, row.device)
                   for row in labelsAndPredictions.takeOrdered(top_n, lambda x: -x.true))

    return 10000.0*len(pred_top & true_top)/total

In [None]:
# K-fold cross-validation of a Spark MLlib gradient-boosted-trees regressor.
K = 5
# K equally weighted random parts; no seed is passed to randomSplit.
predictors_cvSplit = predictors_labelPoints.randomSplit([1.0/K] * K)

for fold in range(K):
    # Hold out one part for testing and train on the union of the others.
    test_rdd = predictors_cvSplit[fold]
    train_rdd = sc.union([predictors_cvSplit[i]  for i in range(K) if (i!= fold)])
    
    # Elements are ((id, device), LabeledPoint); only the LabeledPoint is
    # passed to the trainer.
    # NOTE(review): list.index raises ValueError unless predictor_names holds
    # "categorical_country" and "categorical_category"; those columns are not
    # created in the cells shown here -- confirm they exist.
    model = GradientBoostedTrees.trainRegressor(train_rdd.map(lambda x:x[1])\
                    ,categoricalFeaturesInfo={predictor_names.index("categorical_country"): 1+len(list_countries)\
                              ,predictor_names.index("categorical_category"):1+len(list_categories)}\
                                                , numIterations=100\
                                               ,maxDepth=1\
                                               ,learningRate = 0.1\
                                               ,maxBins = 100)
    
    prediction = model.predict(test_rdd.map(lambda x:x[1].features))
    # Zip label, prediction and key back together into Rows with the fields
    # id, device, pred, true expected by the performance helpers.  The last
    # lambda uses Python 2 tuple-parameter unpacking.
    labelsAndPredictions = test_rdd.map(lambda x: x[1].label)\
    .zip(prediction)\
    .zip(test_rdd.map(lambda x : x[0]))\
    .map(lambda (tuple_true_pred , tuple_id) :\
         Row(id = tuple_id[0], device = tuple_id[1] , pred = tuple_true_pred[1],true = tuple_true_pred[0]))
    #print get_preformance(labelsAndPredictions)
    print get_preformance_old(labelsAndPredictions)
    
    

In [None]:
predictor_names

In [None]:
   
    

# K-fold baseline: ordinary least squares without intercept on feature
# columns 4..11, scored with metric().
# NOTE(review): this cell calls .columns.values and .as_matrix(), so it
# presumably expects `predictors` and `output` to be pandas DataFrames rather
# than the Spark DataFrames built above; it also relies on K and metric being
# defined by other cells -- confirm before running.
print list(predictors.columns.values)
np.random.seed(1)
kf = KFold(len(predictors), n_folds=K)
new_top = []
for train, test in kf:

    # Target is column 5 of `output`.
    old_mod=linear_model.LinearRegression(fit_intercept=False).fit(predictors.as_matrix()[train,4:12], output.as_matrix()[train,5])
    y_pred =  old_mod.predict(predictors.as_matrix()[test,4:12])


    #model
    #mod=linear_model.LinearRegression(fit_intercept=False)\ 
    #mod=linear_model.Lasso(alpha=100,fit_intercept=False)\
    #mod = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0, loss='ls')\ 
    #mod= RandomForestRegressor(max_features = 1.0/3.0,n_estimators = 100)\
    #mod= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100)\
    
    #mod = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
    #.fit(predictors.as_matrix()[train,4:], output.as_matrix()[train,5]) 

    #y_pred =  mod.predict(predictors.as_matrix()[test,4:])
    new_top.append(metric(y_pred,output.as_matrix()[test,5]))

# Mean score over the folds.
print 1.0*sum(new_top)/len(new_top)

In [None]:
predictor_names

In [None]:
LabeledPoint(1,[2])

In [None]:
#Use something like this in the classification step
def even(x):
    """Return True when x leaves no remainder modulo 2."""
    remainder = x % 2
    return remainder == 0


def odd(x):
    """Return the logical negation of even(x)."""
    is_even = even(x)
    return not is_even
rdd = sc.parallelize(range(20))

rdd_odd, rdd_even = (rdd.filter(f) for f in (odd, even))

## To Pandas


In [None]:
output

In [None]:

predictors_pd = pd.read_csv('predictors_spark.csv',sep='\t').drop('Unnamed: 0', 1).fillna(0).sort_values(by="id")

In [None]:
predictors_pd = predictors.toPandas().sort_values(by="id").fillna(0)

In [None]:
# Build the aligned pandas / numpy inputs used by the scikit-learn cells.
predictors_pd = predictors_pd.sort_values(["id","device"])
# Left-merge the targets onto the sorted predictors so output_pd has one row
# per predictor row in the same order; only the key columns and the target
# are kept.
output_pd = pd.merge(predictors_pd,output.toPandas(),on = ["id","device"],how="left")\
[["id","device", "cumulative_downloads_2016_02"]]

#predictors_pd["id"]=(predictors_pd["id"]).astype(str)+predictors_pd["device"]

# Feature columns = all columns minus identifiers and raw text/categorical
# fields.  list.remove raises ValueError if one of these columns is missing.
numerical_predictors = list(predictors_pd.columns.values)
numerical_predictors.remove("id");
numerical_predictors.remove("device");
numerical_predictors.remove("category");
numerical_predictors.remove("name");
numerical_predictors.remove("country");


#numerical_predictors.remove("downloads_std");
#numerical_predictors.remove("nb_missing");
#numerical_predictors.remove("max_step");

# (id, device) per row, used later to map row positions back to apps.
predictors_id = predictors_pd[["id","device"]].as_matrix()

predictors_matrix = predictors_pd[numerical_predictors].as_matrix()


#output_pd["id"]=(output_pd["id"]).astype(str)+output_pd["device"]
output_matrix = output_pd["cumulative_downloads_2016_02"].as_matrix()

#predictors_matrix

In [None]:
#This is the metric we use to determine our performance
def metric(y_pred,y_test,percent=1):
    """Overlap between the top-`percent`% of predictions and of true values.

    y_pred and y_test are equally long sequences indexed by sample position.
    The `top = int(len(y_pred) * percent / 100)` positions with the largest
    value are taken from each (ties keep their original order) and the size of
    their intersection is returned as `overlap / (percent / 100.0) / top`.
    With the default percent=1 this is the share of the true top 1% that was
    also predicted, on a 0-100 scale.

    Returns 0.0 when the top slice is empty (fewer than 100/percent samples);
    this used to raise ZeroDivisionError.

    NOTE(review): for percent != 1 the formula divides by percent/100 a second
    time, so the result is no longer a percentage -- confirm the intent before
    using other values.
    """
    top = int(len(y_pred)/100.0*percent)
    if top <= 0:
        return 0.0

    def _top_positions(values):
        # Stable descending sort of the positions by their value.
        order = sorted(range(len(values)), key=lambda pos: values[pos], reverse=True)
        return set(order[:top])

    overlap = len(_top_positions(y_pred) & _top_positions(y_test))
    return overlap/(percent/100.0)/top

In [None]:
# Display the feature column list (the name was misspelled, causing a NameError).
numerical_predictors

In [None]:
# Grid search over the SVR penalty C and the epsilon-tube width, scored with
# the top-1% metric averaged over K shuffled folds.
# Fix: the grid values are now passed to the model.  SVR used to be built with
# the constants C=1.0 and epsilon=0.1, so every grid point gave the same score.
for c in [0.1,1.0,10,100,1000]:
    for e in [0.001,0.01,0.1,1.0,10,100,1000]:
        K = 5
        #print list(numerical_predictors)
        # Re-seed before each KFold so every (c, e) pair is scored on the same
        # shuffled folds.
        np.random.seed(4)
        kf = KFold(len(predictors_pd), n_folds=K,shuffle=True)
        new_top = []
        for train, test in kf:
            train_features = predictors_pd[numerical_predictors].as_matrix()[train,:]
            train_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[train]

            test_features = predictors_pd[numerical_predictors].as_matrix()[test,:]
            test_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[test]


            #mod = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=2, random_state=0, loss='ls')
            #mod = linear_model.Lasso(alpha=100)
            mod = SVR(kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=0.001, C=c\
                      , epsilon=e, shrinking=True, cache_size=200, verbose=False, max_iter=100)
            mod.fit(train_features, train_output )
            y_pred = mod.predict(test_features)

            # Arguments now follow metric(y_pred, y_test); the score is the
            # same either way because it is a set intersection.
            err = metric(y_pred,test_output)
            new_top.append(err)
            #print err

        print("SVR for c= \t"+str(c)+" and e =\t "+str(e)+"  \t -->  "+str(1.0*sum(new_top)/len(new_top)))

In [None]:

# Experiment: compare one gradient-boosting model trained on the whole
# training fold ("old") against two models trained on disjoint row ranges of
# it and applied to two row ranges of the test fold.
K = 5

np.random.seed(4)
kf = KFold(len(predictors_pd), n_folds=K,shuffle=True)
new_top = []
for train, test in kf:
    train_features = predictors_pd[numerical_predictors].as_matrix()[train,:]
    train_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[train]

    #train_order =(range(len(train)))
    #train_order.sort(key=lambda x: predictors_pd[["id"]].iloc(x))
    
    #train_features = train_features[train_order].astype(float)
    #train_output = train_output[train_order]
    
    # NOTE(review): the training split point is computed from len(test), not
    # len(train), and uses 3/5 while the test split below uses 4/5 --
    # confirm this is intended.
    train_features_1 = train_features[:int(3.0*len(test)/5)]
    train_features_2 = train_features[int(3.0*len(test)/5):]
    train_output_1 = train_output[:int(3.0*len(test)/5)]
    train_output_2 = train_output[int(3.0*len(test)/5):]
    
    test_features = predictors_pd[numerical_predictors].as_matrix()[test,:]
    test_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[test]
    
    #test_order =(range(len(test)))
    #test_order.sort(key=lambda x: predictors_pd[["id"]].iloc(x))
    
    #test_features = test_features[test_order].astype(float)
    #test_output = test_output[test_order]
    
    test_features_1 = test_features[:int(4.0*len(test)/5)]
    test_features_2 = test_features[int(4.0*len(test)/5):]
    test_output_1 = test_output[:int(4.0*len(test)/5)]
    test_output_2 = test_output[int(4.0*len(test)/5):]
    

    # One model per training range, plus the reference model on all rows.
    mod_1 = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
    .fit(train_features_1, train_output_1 )
    mod_2 = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
    .fit(train_features_2, train_output_2 )

    mod = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
    .fit(train_features, train_output )
    
    y_pred_1 = mod_1.predict(test_features_1)
    y_pred_2 = mod_2.predict(test_features_2)

    
    # Stitch the two partial predictions back into test-row order.
    y_pred = np.concatenate((y_pred_1,y_pred_2))
    
    y_pred_old = mod.predict(test_features)
    
    # metric is called with (true, predicted) here; the two arguments have the
    # same length and the score is an intersection size, so the order does not
    # change the result.
    err = (metric(test_output,y_pred))
    new_top.append(err)
    print err
    print "old "+str(metric(test_output,y_pred_old))

# Mean score of the split models over the folds.
print 1.0*sum(new_top)/len(new_top)

In [None]:
split in 2 64.3185961284
No split sorted 64.6278585376
no split unsorted 

In [None]:
# Experiment: sample-weighted gradient boosting on log-downloads, repeated for
# 100 shuffling seeds.  Rows get extra weight when their target is in the top
# 20% / top 1% of the training fold or when an unweighted first-pass model
# misclassifies them with respect to the top 1%.
#
# Fixes:
#  * the first-pass predictions now come from mod_train; the code used
#    mod_top1, which is only trained further down (NameError on the first
#    fold, the previous fold's model afterwards);
#  * threshold_1 is the top-1% cut (len(train)/100); it used to repeat the
#    top-5% index.
for s in range(100):
    np.random.seed(s)
    K = 5

    top_percent_classif = 20


    kf = KFold(len(predictors_pd), n_folds=K,shuffle=True)
    new_top = []
    top_10 = []
    for train, test in kf:
        train_features = predictors_pd[numerical_predictors].as_matrix()[train,:]
        train_output = output_pd[["cumulative_downloads_2016_02"]].as_matrix()[train,0]
        test_features = predictors_pd[numerical_predictors].as_matrix()[test,:]
        test_output = output_pd[["cumulative_downloads_2016_02"]].as_matrix()[test,0]

        '''true_mask = test_output > sorted(test_output,reverse = True)[int(1.0/100*len(test))]

        #Classifications
        mod_class= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100)   
        mod_class.fit(train_features, train_output)
        y_pred =  mod_class.predict(test_features)
        mask_1 = y_pred > (sorted(y_pred,reverse = True))[int(top_percent_classif*1.0/100*len(test))]

        mod_class = linear_model.Lasso(alpha=100)
        mod_class.fit(train_features, train_output)
        y_pred =  mod_class.predict(test_features)
        mask_2 = y_pred > (sorted(y_pred,reverse = True))[int(top_percent_classif*1.0/100*len(test))]

        mask = mask_1*mask_2


        print "top classif 1: "+str(100.0*sum(true_mask*mask_1)/sum(true_mask))
        print "top classif 2: "+str(100.0*sum(true_mask*mask_2)/sum(true_mask))
        print "top classif inter: "+str(100.0*sum(mask_1*mask_2)/sum(mask_1))'''
        # Pre-classification is disabled: every test row is kept.
        mask = np.ones(len(test))

        # Sort once; each threshold is the target value at the given rank.
        sorted_train_output = sorted(train_output,reverse = True)
        threshold_20 = sorted_train_output[int(len(train)/5)]
        threshold_10 = sorted_train_output[int(len(train)/10)]
        threshold_5 = sorted_train_output[int(len(train)/20)]
        threshold_1 = sorted_train_output[int(len(train)/100)]
        threshold_0 = 0

        # Positions ordered by ascending target, shifted so that only values
        # above the 99% mark stay positive.  Currently multiplied by 0 in the
        # weights below.
        relevance = sorted(range(len(train)), key=lambda x: (train_output[x]))
        relevance = (np.asarray(relevance)-int(99.0*len(train)/100))
        relevance[relevance<0] = 0

        # First pass: unweighted model, used only to find the rows whose
        # top-1% membership is predicted wrongly on the training fold.
        mod_train = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
        .fit(train_features,np.log(train_output))
        train_pred =  mod_train.predict(train_features)

        pred_train_mask = train_pred > sorted(train_pred,reverse = True)[int(1.0/100*len(train))]
        true_train_mask = train_output > sorted(train_output,reverse = True)[int(1.0/100*len(train))]
        hard_class = np.logical_xor(pred_train_mask,true_train_mask)


        # The 0* terms are switched-off candidates kept for experimentation.
        weights = 1*(train_output >  threshold_0).astype(int)\
        +1*(train_output >  threshold_20).astype(int)\
        +0*(train_output >  threshold_10).astype(int)\
        +0*(train_output >  threshold_5).astype(int)\
        + 1*(train_output >  threshold_1).astype(int)\
        + 0*relevance\
        + 1*hard_class

        # Second pass: same model, trained with the normalised sample weights.
        mod_top1 = GradientBoostingRegressor(n_estimators=1000, learning_rate=0.01,max_depth=2, random_state=0, loss='ls')\
        .fit(train_features,np.log(train_output) ,sample_weight \
        =(weights)*1.0/sum(weights))

        y_pred_2 =  mod_top1.predict(test_features)*mask
        estimation_error = metric(y_pred_2,test_output)
        new_top.append(estimation_error)
        print(estimation_error)
    print("____")
    print("Top1% with classif1 : " + str(1.0*sum(new_top)/len(new_top)))
    print("____")

In [None]:
64.0625
67.1875
70.3125
64.0625
71.875
Top10%              : 99.6875
Top1% with classif1 : 67.5
    
    
    20 + 1
65.625
67.1875
70.3125
64.0625
71.875
Top10%              : 91.25
Top1% with classif1 : 67.8125    

In [None]:
print len(estimate_top1)
print len(test)
print len(output_test_top_1_precent)
estimate_top1.merge(output_test_top_1_precent, how='inner', on=["id","device"])

In [None]:
len(estimate_top1.merge(output_test_top_1_precent, how='inner', on=["id","device"]))
print 46.0/64

In [None]:
metric(y_pred_2,test_output)
1.0*len(inter)/len(lst)

percent = 1
top = int(len(y_pred_2)/100.0*percent)
print (len(set([i[0] for i in sorted(enumerate(y_pred_2), key=lambda x:x[1],reverse=True)][0:top])
   .intersection([i[0] for i in sorted(enumerate(test_output), key=lambda x:x[1],reverse=True)][0:top])
           ))*1.0/(percent/100.0)/top

print len(set([i[0] for i in sorted(enumerate(y_pred_2), key=lambda x:x[1],reverse=True)][0:top]))
print len([i[0] for i in sorted(enumerate(test_output), key=lambda x:x[1],reverse=True)][0:top])
print (len(set([i[0] for i in sorted(enumerate(y_pred_2), key=lambda x:x[1],reverse=True)][0:top])
   .intersection([i[0] for i in sorted(enumerate(test_output), key=lambda x:x[1],reverse=True)][0:top])
           ))/64.0
print (1.0/(1.0/100.0)/len(y_pred_2)*100)

In [None]:
# Count how many (id, device) pairs of lst_true also occur in lst.  Pairs are
# compared through the concatenated string form of their two fields.  The
# lookup set is built once; the key list used to be rebuilt for every element
# of lst_true.
inter = []
lst_keys = set(str(y[0])+str(y[1]) for y in lst)
for x in [str(y[0])+str(y[1]) for y in lst_true]:
    if x in lst_keys:
        inter.append( x)
print(len(inter))

In [None]:
# Collect the (id, device) pairs of the test rows whose prediction is strictly
# above the value at rank int(len(test)/100) of the sorted predictions.
# test[x[0]] maps the position inside the fold back to the row position in
# predictors_id.
thres = sorted(y_pred_2,reverse=True)[int(len(test)*1.0/100)]
lst= []
for x in enumerate(y_pred_2):
    if x[1]>thres:
        lst.append( predictors_id[test[x[0]]])
        
# Same selection on the true values.
thres = sorted(test_output,reverse=True)[int(len(test)*1.0/100)]
lst_true= []
for x in enumerate(test_output):
    if x[1]>thres:
        lst_true.append( predictors_id[test[x[0]]])


In [None]:
print len((test_output))

In [None]:
# Two-stage experiment, repeated for several sizes of the second-stage model:
#   1. a random forest ranks the test fold and keeps its top
#      top_percent_classif percent as candidates;
#   2. a gradient-boosting model trained on the top top_percent_classif
#      percent of the training fold (log target) re-ranks those candidates and
#      the best 1% of the test fold size are compared with the true top 1%.
# nb_tree only sets the number of trees of the second-stage model; the random
# forest always uses 100 trees.
# NOTE: the variable names and older comments say "10" / "10%", but the cut is
# top_percent_classif (20 here).
for nb_tree in [10,50,100,500,1000,5000,10000]:
    np.random.seed(1)
    K = 5

    top_percent_classif = 20



    kf = KFold(len(predictors_pd), n_folds=K,shuffle=True)
    new_top = []
    top_10 = []
    for train, test in kf:

        # Stage 1: rank the test fold with a random forest on the raw target.
        mod_class10= RandomForestRegressor(max_features = 'sqrt',n_estimators = 100)\
        .fit(predictors_pd[numerical_predictors].as_matrix()[train,:], output_pd[["cumulative_downloads_2016_02"]].as_matrix()[train,0])

        y_pred =  mod_class10.predict(predictors_pd[numerical_predictors].as_matrix()[test,:])

        #estimate of the top 10% of the test set
        estimate_class10= predictors_pd.iloc[test].copy()
        estimate_class10["firstEstimate"] = y_pred
        estimate_class10 = estimate_class10.sort_values(by= "firstEstimate",ascending = False).iloc[0:int(1.0*top_percent_classif/100.0*len(estimate_class10))]
        estimate_class10 = estimate_class10.drop("firstEstimate",1)
        #estimate_class10 = estimate_class10.sort_values(by= "daily_avg",ascending = False).iloc[0:int(1.0*top_percent_classif/100.0*len(estimate_class10))]


        #top 10% of the trainning set
        output_train_top_10_precent = output_pd.iloc[train].copy().sort_values(by= 'cumulative_downloads_2016_02',ascending = False).iloc[0:int(1.0*top_percent_classif/100.0*len(output_pd.iloc[train]))]
        # Bring the predictor columns alongside the selected targets, in the
        # same row order, then drop the target so only features remain.
        predictor_train_top_10_precent = output_train_top_10_precent.merge(predictors_pd, how='left', on=["id","device"]).copy()
        predictor_train_top_10_precent = predictor_train_top_10_precent.drop('cumulative_downloads_2016_02',1)
        #predictor_train_top_10_precent = predictor_train_top_10_precent.drop('firstEstimate',1)

        #This is the actual top 1% of the test set
        output_test_top_1_precent = output_pd.iloc[test].sort_values(by= 'cumulative_downloads_2016_02',ascending = False).iloc[0:int(0.01*len(output_pd.iloc[test]))].copy()


        #second model -> Regression on the top obtainned by regression
        mod_top1 = GradientBoostingRegressor(n_estimators=nb_tree, learning_rate=0.015,max_depth=2, random_state=0, loss='ls')\
        .fit(predictor_train_top_10_precent[numerical_predictors].as_matrix(), np.log(output_train_top_10_precent["cumulative_downloads_2016_02"].as_matrix()))


        y_pred_2 =  mod_top1.predict(estimate_class10[numerical_predictors].as_matrix())

        # Keep the best-ranked candidates, as many as 1% of the test fold.
        estimate_top1 = estimate_class10.copy()
        estimate_top1["secondEstimate"] = y_pred_2
        estimate_top1 = estimate_top1.sort_values(by= "secondEstimate",ascending = False).iloc[:int(0.01*len(output_pd.iloc[test]))]

        #estimate_top1_select = estimate_top1_select.sort_values(by= "thirdEstimate",ascending = False).iloc[0:int(0.01*len(output.iloc[test]))]
        #estimate_top1_noClassif = estimate_top1_noClassif.sort_values(by= "noClassifEstimate",ascending = False).iloc[0:int(0.01*len(output.iloc[test]))]

        # Percentage of the true top 1% recovered by the final selection
        # (new_top) and still present among the stage-1 candidates (top_10).
        estimation_error = len(estimate_top1.merge(output_test_top_1_precent, how='inner', on=["id","device"]))*100.0/len(output_test_top_1_precent)
        new_top.append(estimation_error)
        top_10.append(len(estimate_class10.merge(output_test_top_1_precent, how='inner', on=["id","device"]))*100.0/len(output_test_top_1_precent))

    #print "Old model           : " + str(1.0*sum(old_top)/len(old_top))
    print nb_tree
    print "Top10%              : " + str(1.0*sum(top_10)/len(top_10))
    print "Top1% with classif1 : " + str(1.0*sum(new_top)/len(new_top))
    print "______"
    #print "Top1% with classif2 : " + str(1.0*sum(new_top_select)/len(new_top_select))
    #print "Top1% no classif1   : " + str(1.0*sum(new_top_noClassif)/len(new_top_noClassif))

In [None]:
predictors_pd.drop(list_categories,1).sort_values(["id","device"])[3456:3500]

In [None]:
old_predictors.drop(list_categories,1).sort_values(["id","device"])[3456:3500]

In [None]:
predictors_pd.drop(list_categories,1).sort_values(["id","device"])["market_size"]

In [None]:
# Load the earlier predictor table from predictors.csv, drop the saved index
# column, zero-fill missing values and rename its columns to the names used by
# predictors_pd so the two tables can be compared column by column.
old_predictors = pd.read_csv('predictors.csv').drop('Unnamed: 0', 1).fillna(0)\
.rename(columns={'maxStep': 'max_step', 'minStep': 'min_step','std':'downloads_std','Health and Fitness':'Health_and_Fitness'\
                ,'versions':'num_versions', 'cumulative_downloads_2015-02':'cumulative_downloads_2015_02'\
                ,'Social Networking':'Social_Networking', 'Food and Drink':'Food_and_Drink' ,'Photo and Video':'Photo_and_Video'})

In [None]:
old_list = list(old_predictors.columns.values)
new_list = (predictors_pd.columns.values)
print old_list

In [None]:
# For every column of predictors_pd print the relative L1 difference to the
# same column of old_predictors: sum|new - old| / sum|new|, with both tables
# ordered by (id, device).  Columns that cannot be compared (missing in
# old_predictors, non-numeric, different length, ...) are reported and skipped.
# Both tables are sorted once here; they used to be re-sorted three times for
# every column.
new_sorted = predictors_pd.sort_values(["id","device"])
old_sorted = old_predictors.sort_values(["id","device"])
for n in new_list:
    try:
        new_column = new_sorted[n].as_matrix()
        old_column = old_sorted[n].as_matrix()
        relative_diff = sum(np.absolute(new_column-old_column))/sum(np.absolute(new_column))
    except Exception:
        # Deliberately best-effort, but no longer a bare except, so Ctrl-C and
        # interpreter exit are not swallowed.
        print("error on: "+str(n))
    else:
        print(str(n) + " : "+str(relative_diff))
            

In [None]:
plt.scatter(predictors_pd[predictors_pd.days_since_release != 0].sort_values(["id","device"])["id"].as_matrix()\
     ,predictors_pd[predictors_pd.days_since_release != 0].sort_values(["id","device"])["days_since_release"].as_matrix()
           )

In [None]:
# Fit days_since_release as a polynomial of degree `temp` in the scaled app id
# (rows with days_since_release == 0 are excluded) and plot data and fit.
temp = 3

# Design matrix: the id column divided by 1e8, repeated temp+1 times and
# raised column-wise to the powers 0..temp.
train_days = np.power(\
                      np.matlib.repmat(predictors_pd[predictors_pd.days_since_release != 0]\
                              .sort_values(["id","device"])[["id"]].as_matrix()*1.0/100000000,1,temp+1),np.array(range(temp+1)))

# Regression target.  Despite the name this is not a held-out set: it holds
# days_since_release for the same rows as train_days.
test_days = predictors_pd[predictors_pd.days_since_release != 0]\
                              .sort_values(["id","device"])[["days_since_release"]].as_matrix()
    
# No intercept term: the power-0 column of train_days is already constant 1.
day_mod=linear_model.LinearRegression(fit_intercept=False).fit(train_days,test_days )

# Scatter of the observed values (colour 1) and the fitted values (colour 2)
# against the scaled id; the fitted value is the row-wise sum of the features
# multiplied by their coefficients.
plt.scatter(np.concatenate((train_days[:,1],train_days[:,1])), \
        np.concatenate((test_days[:,0],np.sum(np.dot(train_days,\
          np.diag(day_mod.coef_[0,:])),axis=1))),\
       c=np.concatenate((np.ones(len(train_days)),2*np.ones(len(train_days)))),\
       alpha=.1)

print day_mod.coef_

In [None]:
day_mod=linear_model.LinearRegression(fit_intercept=False).fit(train_days,test_days )

In [None]:
day_mod.coef_


In [None]:
plt.scatter(np.concatenate((train_days[:,1],train_days[:,1])), \
            np.concatenate((test_days[:,0],np.sum(np.dot(train_days,\
              np.diag(day_mod.coef_[0,:])),axis=1))),\
           c=np.concatenate((np.ones(len(train_days)),2*np.ones(len(train_days)))),\
           alpha=.1)

In [None]:
np.sum(np.dot(train_days,np.diag([  1.73879033e-24,  -8.16149354e-06,   4.19123763e-15,\
         -2.24536991e-16,  -2.24586827e-16])),axis=1)

In [None]:
np.concatenate((test_days[:,0],np.sum(np.dot(train_days,\
              np.diag([  1.73879033e-24,  -8.16149354e-06,   4.19123763e-15,\
         -2.24536991e-16,  -2.24586827e-16])),axis=1))) 

In [None]:
(np.ones(len(train_days)),2*np.ones(len(train_days)))

In [None]:
np.transpose(test_days)

In [None]:
np.concatenate((train_days[:,1],train_days[:,1]))

In [None]:
train_days[0,:]

In [None]:
train_days = np.power(\
                      np.matlib.repmat(predictors_pd[predictors_pd.days_since_release != 0]\
                              .sort_values(["id","device"])[["id"]].as_matrix()*1.0/100000000,1,temp+1),np.array(range(temp+1)))

In [None]:
281704574**3

In [None]:
predictors_pd.sort_values(["id","device"])[["nb_reviews","cumulative_downloads_2015_02","review_per_downloads","review_per_day"]]

In [None]:
old_predictors.sort_values(["id","device"])[["num_review","cumulative_downloads_2015-02","review_per_downloads"]]

In [None]:
predictors_pd

In [None]:
mem = pd.read_csv('../iphones_per_country.csv').fillna(0)


In [None]:
int(mem[mem.Country == "Mexico"]["iPhone"])

In [None]:
mem["Country"]

In [None]:
int(mem[mem.Country == "Mexico"]["iPhone"])

In [None]:
# Print the iPad figure of every country in list_countries as a ready-to-paste
# dict entry ('"Country" : value,') and collect in `s` the countries for which
# no single integer could be read from `mem`.  Underscores in the country name
# are replaced by spaces for the lookup.
# Fix: the handler used to run `t+=1` on a name that is not a counter in this
# notebook, so the first missing country raised instead of being collected.
s=[]
for l in list_countries:
    try:
        ipad_count = int(mem[mem.Country == l.replace("_"," ")]["iPad"])
    except Exception:
        # Best-effort lookup: typically no (or more than one) matching row.
        s.append(l)
    else:
        print('"'+l+'" : '+str(ipad_count)+",")
print(s)


In [None]:
predictors_pd.sort_values(["id","device"])[["id"]].as_matrix()[:,0]

In [None]:
print predictors_pd.sort_values(["id","device"])[["id"]].as_matrix()[[int(32339.0/5),int(2*32339.0/5),int(3*32339.0/5),int(4*32339.0/5)],0]

plt.scatter(predictors_pd.sort_values(["id","device"])[["id"]].as_matrix()[:,0],\
        output_pd.sort_values(["id","device"])[["cumulative_downloads_2016_02"]].as_matrix()[:,0]\
       ,alpha=.1\
           ,c = 2*(output_pd.sort_values(["id","device"])[["cumulative_downloads_2016_02"]].as_matrix()[:,0]\
                   >sorted(output_pd['cumulative_downloads_2016_02'])\
            [int(len(output_pd['cumulative_downloads_2016_02'])*0.99)])\
             )

In [None]:
for train,test in t:
    print test[0]

In [None]:
predictors_pd.sort_values(["id","device"])[["id"]].as_matrix()

In [None]:
#!/usr/bin/env python

import numpy as np
import joblib
from sklearn.ensemble import GradientBoostingRegressor

np.seterr(all='raise')

# Parse data/train.txt: each line is "<label> <key>:<value> ... [# comment]".
# labels collects the integer label, features the float after each ':'.
labels = []
features = []
# `with` closes the file even if a line fails to parse.
with open('data/train.txt') as f:
    for line in f:
        # strip off comments
        # Splitting on '#' leaves lines without a comment untouched; the
        # previous slice line[:line.find('#') - 1] cut the last two characters
        # of such lines (find returns -1), corrupting the final feature.
        ls = line.split('#', 1)[0].split()
        if not ls:
            # blank or comment-only line
            continue
        labels.append(int(ls[0]))
        features.append([float(x[x.find(':') + 1:]) for x in ls[1:]])

# Convert the parsed lists to arrays: int32 labels, 2-D feature matrix.
labels = np.asarray(labels, dtype=np.int32)
features = np.asarray(features)

# to test with gbm
# Dump both arrays as CSV (features still include the query column here).
np.savetxt('labels.csv', labels, delimiter=',')
np.savetxt('features.csv', features, delimiter=',')

# The first parsed column is split off as `query`; the rest are the features.
query = features[:, 0]
features = features[:, 1:]

# NOTE(review): loss='ndcg' and a third positional `query` argument to fit()
# are presumably provided by a patched scikit-learn build; with a stock
# release the third positional argument of fit is sample_weight -- confirm
# which build this targets.
gb = GradientBoostingRegressor(loss='ndcg', learning_rate=0.1,
                               n_estimators=5, max_depth=4, verbose=2, subsample=0.5,
                               random_state=1)
print(gb)
gb.fit(features, labels, query)

# Persist the fitted model.
joblib.dump(gb, 'gb.pkl')

In [None]:
import math
import numpy as np
import math
# import pandas
from optparse import OptionParser
from sklearn.tree import DecisionTreeRegressor
from collections import defaultdict
from copy import deepcopy
from multiprocessing import Pool
from itertools import chain
import time

class Ensemble:
    """Additive ensemble of fitted trees that share one learning rate.

    The prediction for a batch is the sum over all trees of
    tree.predict(batch) * rate.  Trees only need a predict(objects) method.
    """

    def __init__(self, rate):
        # trees: fitted models in insertion order; rate: shrinkage applied to
        # every tree's prediction.
        self.trees = []
        self.rate = rate

    def __len__(self):
        """Return the number of trees currently in the ensemble."""
        return len(self.trees)

    def add(self, tree):
        """Append a fitted tree to the ensemble."""
        self.trees.append(tree)

    def eval_one(self, object):
        """Return the ensemble prediction for a single sample."""
        return self.eval([object])[0]

    def eval(self, objects):
        """Return an array with the ensemble prediction for each sample."""
        results = np.zeros(len(objects))
        for tree in self.trees:
            results += tree.predict(objects) * self.rate
        return results

    def remove(self, number):
        """Drop the last `number` trees.

        number == 0 leaves the ensemble unchanged.  It used to empty it,
        because trees[:-0] is the empty list.
        """
        if number == 0:
            return
        self.trees = self.trees[:-number]

def groupby(score, query):
    """Split `score` into arrays of consecutive entries sharing a query id.

    score and query are walked in lockstep; a new group starts whenever the
    query id differs from the previous one, so equal ids that are not adjacent
    end up in separate groups.  Each group is converted with np.array.
    """
    pages = []
    previous_query = None
    for value, query_id in zip(score, query):
        if query_id != previous_query:
            # query id changed: open a new group
            pages.append([])
            previous_query = query_id
        pages[-1].append(value)
    return map(np.array, pages)


def compute_point_dcg(arg):
    """Return the DCG term (2**rel - 1) / log2(i + 2) for arg = (rel, i)."""
    relevance, position = arg
    gain = 2 ** relevance - 1
    discount = math.log(position + 2, 2)
    return gain / discount


def compute_point_dcg2(arg):
    """Return the linear-gain DCG term for arg = (rel, i).

    Position 0 returns rel unchanged; any other position returns
    rel / log2(1 + i).
    """
    relevance, position = arg
    if position != 0:
        return relevance / (math.log(1 + position, 2))
    return relevance


def compute_dcg(array):
    """Return the DCG of `array`: the sum of compute_point_dcg over (value, index)."""
    return sum(compute_point_dcg((relevance, position))
               for position, relevance in enumerate(array))


def compute_ndcg(page, k=10):
    """Return the NDCG@k of `page`, an array of relevances in ranked order.

    The DCG of the first k entries is divided by the DCG of the k largest
    entries in descending order.  Returns 1 when that ideal DCG is 0.
    """
    ideal_order = np.sort(page)[::-1]
    ideal_dcg = compute_dcg(ideal_order[:k])
    actual_dcg = compute_dcg(page[:k])
    if ideal_dcg != 0:
        return actual_dcg / ideal_dcg
    return 1


def ndcg(prediction, true_score, query, k=10):
    """Return the mean NDCG@k over all query groups.

    prediction, true_score and query are aligned per sample.  Within each
    group the true scores are ordered by descending prediction before being
    scored with compute_ndcg.
    """
    true_pages = groupby(true_score, query)
    pred_pages = groupby(prediction, query)

    per_query = []
    for page_index in range(len(true_pages)):
        ranking = np.argsort(pred_pages[page_index])[::-1]
        ranked_truth = true_pages[page_index][ranking]
        per_query.append(compute_ndcg(ranked_truth, k))
    return sum(per_query) / len(per_query)


def query_lambdas(page):
    """Compute pairwise lambda values for one query group.

    page is a (true_page, pred_page) pair of equally long arrays.  Returns an
    array with one accumulated lambda per document.

    NOTE(review): the returned lambdas are indexed in the order of the
    re-sorted `page` array built below, not in the order of the input arrays
    -- confirm callers expect that.
    """
    true_page, pred_page = page
    # Order both arrays by ascending true relevance first ...
    worst_order = np.argsort(true_page)
    true_page = true_page[worst_order]
    pred_page = pred_page[worst_order]

    # ... then order the true relevances by ascending prediction.
    page = true_page[np.argsort(pred_page)]
    # DCG of the relevances in descending order, used for normalisation.
    idcg = compute_dcg(np.sort(page)[::-1])
    # position_score[i, j]: DCG term of document i if it sat at position j.
    position_score = np.zeros((len(true_page), len(true_page)))

    for i in xrange(len(true_page)):
        for j in xrange(len(true_page)):
            position_score[i, j] = compute_point_dcg((page[i], j))

    lambdas = np.zeros(len(true_page))

    for i in xrange(len(true_page)):
        for j in xrange(len(true_page)):
                # Only pairs where document i is strictly more relevant.
                if page[i] > page[j]:

                    # Change in DCG if documents i and j swapped positions.
                    delta_dcg = position_score[i][j] - position_score[i][i]
                    delta_dcg += position_score[j][i] - position_score[j][j]

                    delta_ndcg = abs(delta_dcg / idcg)

                    # NOTE(review): rho is computed from the difference of the
                    # true relevances held in `page`; pairwise lambda schemes
                    # usually use the difference of predicted scores here --
                    # confirm this is intended.
                    rho = 1 / (1 + math.exp(page[i] - page[j]))
                    lam = rho * delta_ndcg

                    lambdas[i] -= lam
                    lambdas[j] += lam
    return lambdas


def compute_lambdas(prediction, true_score, query, k=10):
    """Compute lambdas for every query group in parallel.

    prediction, true_score and query are aligned per sample.  Samples are
    grouped by consecutive query id, query_lambdas runs on each group in a
    multiprocessing pool, and the per-group results are concatenated into one
    flat list in group order.  k is accepted for interface compatibility and
    is not used.
    """
    true_pages = groupby(true_score, query)
    pred_pages = groupby(prediction, query)

    print("%d pages" % len(true_pages))

    pool = Pool()
    try:
        lambdas = pool.map(query_lambdas, zip(true_pages, pred_pages))
    finally:
        # Release the worker processes; the pool used to be left open, leaking
        # one set of workers per call.
        pool.close()
        pool.join()
    return list(chain(*lambdas))


def mart_responces(prediction, true_score):
    """Return the residuals ``true_score - prediction``.

    ``learn`` uses these as the training targets for the next tree.
    """
    residuals = true_score - prediction
    return residuals


def learn(features,scores, queries = "no_query", n_trees=10, learning_rate=0.1, k=10, verbiose = False):
    """Fit a boosted ensemble of depth-2 regression trees on residuals.

    features      -- training inputs, passed unchanged to
                     ``DecisionTreeRegressor.fit`` / ``predict``.
    scores        -- training targets; must support ``scores - array``.
    queries       -- query identifier per row, used only for the training
                     NDCG. With the default ``"no_query"`` all rows are put
                     into one single query.
    n_trees       -- number of boosting iterations (one tree each).
    learning_rate -- shrinkage applied to every tree's prediction; also
                     passed to the ``Ensemble`` constructor.
    k             -- cut-off for the training NDCG. It has no influence on
                     the fitted trees.
    verbiose      -- when true, print progress and timing messages.

    Returns the ``Ensemble`` holding the fitted trees.

    Each iteration fits a tree to ``mart_responces(model_output, scores)``
    and adds ``learning_rate * tree.predict(features)`` to the running
    model output. Validation scoring from the original script is disabled,
    so no validation data is needed.
    """
    def log(message):
        # Single place for the verbosity switch.
        if verbiose:
            print(message)

    log("Loading train file")

    # The isinstance guard keeps array-valued `queries` away from an
    # ambiguous element-wise comparison with a string.
    if isinstance(queries, str) and queries == "no_query":
        queries = np.ones(len(scores))

    ensemble = Ensemble(learning_rate)

    log("Training starts...")
    model_output = np.zeros(len(features))

    # Return value unused; kept from the original script (on some platforms
    # the first call to time.clock() sets its reference point).
    time.clock()
    for i in range(n_trees):
        log(" Iteration: " + str(i + 1))

        # Pseudo responses: plain residuals act as the training labels.
        start = time.clock()
        log("  --generating labels")
        lambdas = mart_responces(model_output, scores)
        log("  --done " + str(time.clock() - start) + "sec")

        # Create the tree and append it to the model.
        log("  --fitting tree")
        start = time.clock()
        tree = DecisionTreeRegressor(max_depth=2)
        tree.fit(features, lambdas)
        log("  ---done " + str(time.clock() - start) + "sec")

        log("  --adding tree to ensemble")
        ensemble.add(tree)

        # Update the running model output with the shrunken step.
        log("  --generating step prediction")
        prediction = tree.predict(features)

        log("  --updating full model output")
        model_output += learning_rate * prediction

        # Training score (diagnostic only).
        start = time.clock()
        log("  --scoring on train")
        train_score = ndcg(model_output, scores, queries, k)
        log("  --iteration train score " + str(train_score)
            + ", took " + str(time.clock() - start) + "sec to calculate")

    log("final quality evaluation")
    log("Finished sucessfully.")
    log("------------------------------------------------")
    return ensemble


def evaluate(model, predict, queries = "no_query"):
    """Return the predictions of ``model`` for the rows of ``predict``.

    model   -- object exposing ``eval(features)`` (the ensemble returned by
               ``learn``).
    predict -- feature rows to score; passed unchanged to ``model.eval``.
    queries -- accepted for backward compatibility; it does not influence
               the result.

    The former ``queries == "no_query"`` check has been removed: its result
    was never used, and comparing an array-valued ``queries`` with a string
    is ambiguous inside ``if``.
    """
    return model.eval(predict)


# Disabled command-line entry point, kept as a bare string literal so it is
# never executed.
# NOTE(review): the call inside it passes file names to learn(), whereas
# learn() now takes (features, scores, ...), so it would need updating
# before being re-enabled.
'''if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-t", "--train", action="store", type="string", dest="train_file")
    parser.add_option("-v", "--validation", action="store", type="string", dest="val_file")
    parser.add_option("-p", "--predict", action="store", type="string", dest="predict_file")
    options, args = parser.parse_args()
    iterations = 30
    learning_rate = 0.001

    model = learn(options.train_file, options.val_file, n_trees=200)
    evaluate(model, options.predict_file)
'''

# Marker printed once all statements of this cell have run.
print "functions loaded"

In [None]:
# Assemble the matrix used by the NDCG experiments. The final column layout is
#   0: cumulative_downloads_2016_02 (target)
#   1: constant 1.0 (single query id)
#   2: id
#   3+: numerical predictors
# Row counts are derived from the frames instead of a hard-coded 32339, so
# the cell keeps working when the data set changes size.
ndcg_predictors = np.concatenate(
    (np.ones((len(predictors_pd), 1)),
     predictors_pd[["id"]].as_matrix(),
     predictors_pd[numerical_predictors].as_matrix()),
    axis=1)
ndcg_output = output_pd.sort_values(["cumulative_downloads_2016_02"], ascending=False)

# Rank 1 is the row with the most downloads.
ndcg_output['true_ranking'] = pd.Series(np.arange(1, len(ndcg_output) + 1), index=ndcg_output.index)
# Re-align the targets with the row order of predictors_pd.
ndcg_output = pd.merge(predictors_pd[["id"]], ndcg_output, on="id", how="left")[
    ["id", "true_ranking", "cumulative_downloads_2016_02"]]

# Alternative target (disabled in the original): the "true_ranking" column.
ndcg_predictors = np.concatenate(
    (ndcg_output[["cumulative_downloads_2016_02"]].as_matrix(), ndcg_predictors),
    axis=1)

In [None]:
# Fixed, unshuffled row split: the first 20000 rows train, the next 11339
# validate, the remaining rows test.
lst = range(32339)
train, validation, test = lst[:20000], lst[20000:31339], lst[31339:]

# Print the boundaries of the three slices as a sanity check.
print(train[-1])
print(validation[0])
print(validation[-1])
print(test[0])
print(len(test))


In [None]:
# Train a 100-tree ensemble on the current train split and score the
# current test split.
ndcg_model = learn(
    train_features,
    train_output,
    n_trees=100,
    learning_rate=0.1,
    k=10)
predi = evaluate(ndcg_model, test_features)

In [None]:
# Re-score the current test features with the existing ndcg_model.
predi = evaluate(ndcg_model,test_features)

In [None]:
# Top-1% overlap between predictions and targets. The arguments are given as
# (pred, true), the reverse of the signature zip_metric(true, pred); for
# equal-length inputs the metric treats both arguments alike.
zip_metric(predi,test_output)

In [None]:

#calculate performance
def zip_metric(true,pred):
    """Score how well the top ~1% of ``pred`` overlaps the top ~1% of ``true``.

    true -- sequence of observed values.
    pred -- sequence of predicted values, aligned position by position with
            ``true``.

    The threshold of each sequence is the element at index
    ``int(len(true) / 100)`` of its descending sort. A position counts as a
    hit when both its prediction and its true value are strictly above their
    thresholds. The result is ``hits / len(true) * 100 * 100``, so a perfect
    ranking of 1000 items scores 100.0. With fewer than 100 items the
    threshold is the maximum and the score is 0.0.

    If the two sequences differ in length, a message with both lengths is
    printed and ``None`` is returned.
    """
    nb_tot = len(true)
    if len(pred) != nb_tot:
        print("length of true and pred are different : "
              + str(nb_tot) + "   :   " + str(len(pred)))
        return None

    cut = int(nb_tot * 1.0 / 100)
    pred_thred = sorted(pred, reverse=True)[cut]
    true_thred = sorted(true, reverse=True)[cut]

    hits = sum(1 for p, t in zip(pred, true)
               if p > pred_thred and t > true_thred)
    return hits * 1.0 / nb_tot * 100 * 100
    


In [None]:
# Sanity check of the top-1% metric on synthetic data: `pred` equals `true`
# except that 50 larger values are put at the front.
true = range(30000)
pred = list(range(30050, 30000, -1)) + list(range(29950))
print(zip_metric(range(30000), range(30000)))

# Same computation as zip_metric, spelled out step by step.
nb_tot = len(true)
pred_thred = sorted(pred, reverse=True)[int(nb_tot*1.0/100)]
true_thred = sorted(true, reverse=True)[int(nb_tot*1.0/100)]
try:
    print(len([x for x in range(nb_tot)
               if (pred[x] > pred_thred and true[x] > true_thred)]) * 1.0 / nb_tot * 100 * 100)
except IndexError:
    # Raised when `pred` is shorter than `true`; report both lengths.
    print("length of true and pred are different : "
          + str(len(true)) + "   :   " + str(len(pred)))

In [None]:
# Scratch arithmetic: 250 out of 300 as a fraction.
# NOTE(review): presumably the expected hit ratio for the synthetic
# zip_metric check (30000 items) -- confirm.
1.0*250/300


In [None]:
# Inspect the first 10 test rows: pair each prediction with column 0 of
# ndcg_predictors, sort the pairs by prediction, tag each with its position
# 0..9 in that order, then re-sort by the column-0 value.
sorted(zip(sorted(zip(predi[:10],ndcg_predictors[test,0][:10])),range(10)),key = lambda x : x[0][1])

In [None]:
# K-fold run: 100 trees, learning rate 0.1, k = 1% of the test fold.
# k is only passed on to the training NDCG inside learn().
# Seeding first is presumably meant to make the shuffled folds reproducible.
np.random.seed(1)
kf = KFold(len(predictors_pd), n_folds=K, shuffle=True)
for train, test in kf:
    ndcg_model = learn(
        predictors_pd[numerical_predictors].as_matrix()[train, :],
        output_pd["cumulative_downloads_2016_02"].as_matrix()[train],
        n_trees=100,
        learning_rate=0.1,
        k=int(len(test) / 100.0))
    predi = evaluate(ndcg_model, predictors_pd[numerical_predictors].as_matrix()[test, :])
    print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))

In [None]:
# K-fold run: 100 trees, learning rate 0.1, k = 1% of the training fold.
# k is only passed on to the training NDCG inside learn().
np.random.seed(1)
kf = KFold(len(predictors_pd), n_folds=K, shuffle=True)
for train, test in kf:
    ndcg_model = learn(
        predictors_pd[numerical_predictors].as_matrix()[train, :],
        output_pd["cumulative_downloads_2016_02"].as_matrix()[train],
        n_trees=100,
        learning_rate=0.1,
        k=int(len(train) / 100.0))
    predi = evaluate(ndcg_model, predictors_pd[numerical_predictors].as_matrix()[test, :])
    print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))

In [None]:
# K-fold run: 1000 trees, learning rate 0.1, k = 1% of the test fold.
# k is only passed on to the training NDCG inside learn().
np.random.seed(1)
kf = KFold(len(predictors_pd), n_folds=K, shuffle=True)
for train, test in kf:
    ndcg_model = learn(
        predictors_pd[numerical_predictors].as_matrix()[train, :],
        output_pd["cumulative_downloads_2016_02"].as_matrix()[train],
        n_trees=1000,
        learning_rate=0.1,
        k=int(len(test) / 100.0))
    predi = evaluate(ndcg_model, predictors_pd[numerical_predictors].as_matrix()[test, :])
    print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))

In [None]:
# K-fold run: 1000 trees, learning rate 0.1, k = 1% of the training fold.
# k is only passed on to the training NDCG inside learn().
np.random.seed(1)
kf = KFold(len(predictors_pd), n_folds=K, shuffle=True)
for train, test in kf:
    ndcg_model = learn(
        predictors_pd[numerical_predictors].as_matrix()[train, :],
        output_pd["cumulative_downloads_2016_02"].as_matrix()[train],
        n_trees=1000,
        learning_rate=0.1,
        k=int(len(train) / 100.0))
    predi = evaluate(ndcg_model, predictors_pd[numerical_predictors].as_matrix()[test, :])
    print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))

In [None]:
# K-fold run: 1000 trees, learning rate 0.01, k = 1% of the test fold.
# k is only passed on to the training NDCG inside learn().
np.random.seed(1)
kf = KFold(len(predictors_pd), n_folds=K, shuffle=True)
for train, test in kf:
    ndcg_model = learn(
        predictors_pd[numerical_predictors].as_matrix()[train, :],
        output_pd["cumulative_downloads_2016_02"].as_matrix()[train],
        n_trees=1000,
        learning_rate=0.01,
        k=int(len(test) / 100.0))
    predi = evaluate(ndcg_model, predictors_pd[numerical_predictors].as_matrix()[test, :])
    print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))

In [None]:
# K-fold run: 1000 trees, learning rate 0.01, k = 1% of the training fold.
# k is only passed on to the training NDCG inside learn().
np.random.seed(1)
kf = KFold(len(predictors_pd), n_folds=K, shuffle=True)
for train, test in kf:
    ndcg_model = learn(
        predictors_pd[numerical_predictors].as_matrix()[train, :],
        output_pd["cumulative_downloads_2016_02"].as_matrix()[train],
        n_trees=1000,
        learning_rate=0.01,
        k=int(len(train) / 100.0))
    predi = evaluate(ndcg_model, predictors_pd[numerical_predictors].as_matrix()[test, :])
    print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# Reference model: scikit-learn gradient boosting with 1000 depth-2 trees,
# fitted on whatever `train` indices are currently in scope.
mod = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=2,
    random_state=0,
    loss='ls'
).fit(
    predictors_pd[numerical_predictors].as_matrix()[train, :],
    output_pd["cumulative_downloads_2016_02"].as_matrix()[train])

In [None]:

# One-stage AdaBoost wrapper around the gradient boosting model `mod`.
# NOTE(review): scikit-learn ensembles normally clone and refit their base
# estimator, so the earlier fit of `mod` is presumably not reused -- confirm.
modmod= AdaBoostRegressor(base_estimator=mod, n_estimators=1, learning_rate=1, loss='linear', random_state=None)

In [None]:
# Fit the AdaBoost wrapper on the rows selected by the current `train` indices.
modmod.fit(predictors_pd[numerical_predictors].as_matrix()[train,:],output_pd["cumulative_downloads_2016_02"].as_matrix()[train])

In [None]:
# Top-1% overlap of the AdaBoost wrapper on the current test rows.
predi = modmod.predict(
    predictors_pd[numerical_predictors].as_matrix()[test, :])
print(zip_metric(
    output_pd["cumulative_downloads_2016_02"].as_matrix()[test],
    predi))

In [None]:
# Top-1% overlap of the plain gradient boosting model on the current test rows.
predi = mod.predict(
    predictors_pd[numerical_predictors].as_matrix()[test, :])
print(zip_metric(
    output_pd["cumulative_downloads_2016_02"].as_matrix()[test],
    predi))

In [None]:

# Fit a 100-tree gradient boosting model, then repeatedly wrap `modmod` in a
# new one-stage AdaBoostRegressor and print the top-1% overlap each time.
# NOTE(review): the `mod` fitted here is not used below -- the first
# prediction and the loop both start from the `modmod` that already exists
# in the session. Confirm this is intended.
mod = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=2,
    random_state=0,
    loss='ls'
).fit(
    predictors_pd[numerical_predictors].as_matrix()[train, :],
    output_pd["cumulative_downloads_2016_02"].as_matrix()[train])
predi = modmod.predict(predictors_pd[numerical_predictors].as_matrix()[test, :])
print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))
for i in range(100):
    # Each pass uses the previous wrapper as the new base estimator.
    modmod = AdaBoostRegressor(
        base_estimator=modmod,
        n_estimators=1,
        learning_rate=1,
        loss='linear',
        random_state=None)
    modmod.fit(
        predictors_pd[numerical_predictors].as_matrix()[train, :],
        output_pd["cumulative_downloads_2016_02"].as_matrix()[train])
    predi = modmod.predict(predictors_pd[numerical_predictors].as_matrix()[test, :])
    print(zip_metric(output_pd["cumulative_downloads_2016_02"].as_matrix()[test], predi))

In [None]:
# Experiment: re-fit nested AdaBoost wrappers with per-sample weights derived
# from the top-1% ordering. Only the first fold is processed -- the loop ends
# with an unconditional `break`.
# This cell relies on Python 2: range() must return a list for .sort().
for train,test in KFold(len(predictors_pd), n_folds=K,shuffle = True):
    train_features = predictors_pd[numerical_predictors].as_matrix()[train,:].astype(float)
    train_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[train]
    #temp = sorted(zip(train_output,train_features),key = lambda x:-x[0])
    #train_feature = np.vstack([x[1] for x in temp])
    #train_output = [x[0] for x in temp]
    test_features = predictors_pd[numerical_predictors].as_matrix()[test,:].astype(float)
    test_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[test]
    #temp = sorted(zip(test_output,test_features),key = lambda x:-x[0])
    #test_feature = np.vstack([x[1] for x in temp])
    #test_output = [x[0] for x in temp]
    # Row positions ordered by ascending target value.
    # NOTE(review): this is an argsort permutation, not a per-row rank --
    # confirm it is what the weights below are meant to use.
    relevance =(range(len(train)))
    relevance.sort(key=lambda x: train_output[x])
    # Shift by the 99% cut-off and clip negatives to 0; subtracting the zeros
    # array turns the result into floats.
    relevance = (np.asarray(relevance)-int(99.0*len(train)/100)-np.zeros(len(train)))
    relevance[relevance<0] = 0
    #int(99.0*len(train)/100)

    # Baseline: a single depth-2 gradient boosting stage.
    modmod = GradientBoostingRegressor(n_estimators=1, learning_rate=0.1,max_depth=2, random_state=0, loss='ls')\
    .fit(train_features,train_output )
    predi = modmod.predict(test_features)
    print zip_metric(test_output,predi)

    train_predi = modmod.predict(train_features)
    # Training residuals; not used again in this cell.
    point_error = (train_output - train_predi )

    # Same construction as `relevance`, based on the training predictions.
    pred_relevance =(range(len(train)))
    pred_relevance.sort(key=lambda x: train_predi[x])
    pred_relevance = (np.asarray(pred_relevance)-int(99.0*len(train)/100)-np.zeros(len(train)))
    pred_relevance[pred_relevance<0] = 0

    for i in range(100):
        # Sum of squared errors of the current predictions on the test fold.
        print "score : "+str(np.dot(test_output-predi,test_output-predi))
        # Wrap the previous model as the base estimator of a new one-stage AdaBoost.
        modmod= AdaBoostRegressor(base_estimator=modmod, n_estimators=1, learning_rate=0.01, loss='linear', random_state=None)
        # Third positional argument of fit: presumably sample_weight -- confirm
        # against the scikit-learn version in use. It is computed once before
        # the loop and is the same in every iteration.
        modmod.fit(train_features,train_output,np.abs(pred_relevance-relevance))
        predi = modmod.predict(test_features)
        print zip_metric(test_output,predi)
    break

In [None]:
# Train the custom 100-tree ensemble on the split left in scope and score
# the matching test features.
ndcg_model = learn(
    train_features,
    train_output,
    n_trees=100,
    learning_rate=0.1,
    k=10)
predi = evaluate(ndcg_model, test_features)

In [None]:
# Compare the custom ensemble with scikit-learn's GradientBoostingRegressor
# over 15 repetitions of shuffled K-fold. For each fold the first printed
# number is the custom model's top-1% overlap, the second is scikit-learn's.
np.random.seed(1)
for s in range(15):
    for train, test in KFold(len(predictors_pd), n_folds=K, shuffle=True):
        train_features = predictors_pd[numerical_predictors].as_matrix()[train, :].astype(float)
        train_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[train]
        test_features = predictors_pd[numerical_predictors].as_matrix()[test, :].astype(float)
        test_output = output_pd["cumulative_downloads_2016_02"].as_matrix()[test]

        # Row positions ordered by ascending target; not used further in this cell.
        relevance = sorted(range(len(train)), key=lambda x: train_output[x])

        ndcg_model = learn(
            train_features,
            train_output,
            n_trees=100,
            learning_rate=0.1,
            k=10)
        predi = evaluate(ndcg_model, test_features)
        print(zip_metric(predi, test_output))

        modmod = GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=2,
            random_state=0,
            loss='ls'
        ).fit(train_features, train_output)
        predi = modmod.predict(test_features)
        print(zip_metric(test_output, predi))

        print("**********")
    print("______________")

In [None]:
# Step-by-step version of zip_metric applied to the latest predictions, with
# the intermediate values left in the session for inspection.
true = test_output
pred = predi
nb_tot = len(true)
true_thred = sorted(true, reverse=True)[int(nb_tot / 100.0)]
pred_thred = sorted(pred, reverse=True)[int(nb_tot / 100.0)]
print(len([x for x in range(nb_tot)
           if pred[x] > pred_thred and true[x] > true_thred]) * 1.0 / nb_tot * 100 * 100)


In [None]:
# Scratch line: sort `indices` in place by the value `input` holds for each index.
# NOTE(review): neither `indices` nor a sequence named `input` is defined in
# the visible cells (`input` is otherwise the builtin) -- confirm or remove.
indices.sort(key=lambda x: input[x])

In [None]:
# Display the training residuals (train_output - train_predi) from the
# boosting experiment.
point_error

In [None]:
# Display the element-wise reciprocal of the training residuals.
np.ones(len(train))/point_error

In [None]:
# Display the target values of the current training split.
train_output

In [None]:
# Display the predictor rows whose week_1 value exceeds 8.62.
predictors_pd[predictors_pd.week_1 >8.62 ]

In [None]:
# Display the output row(s) for one specific id.
output_pd[output_pd.id == "911686788iphone"]