#### Initial Setup

In [1]:
#imported libraries
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import numpy as np
import scipy as sp
import math
import matplotlib.pyplot as plt
import langdetect
import datetime
%matplotlib inline  
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import Imputer

import findspark; findspark.init()
import pyspark
import os
from pyspark.sql.types import *
from pyspark.sql.types import Row
import pyspark.sql.functions as func

os.environ["PYSPARK_SUBMIT_ARGS"] = (
  "--packages com.databricks:spark-csv_2.11:1.4.0 pyspark-shell"
)

#NEED TO ADD "  SPARK_DRIVER_MEMORY=5G   "  to ./conf/spark-env.sh 

try:
    sc = pyspark.SparkContext()
except Exception as e:
    print "SparkContext exists... Continuing on."
    
sqlCtx = pyspark.sql.SQLContext(sc)



#### Load files

In [2]:
downloads = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true') \
    .load('train_app_downloads.csv').drop('')
ratings = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true') \
    .load('train_app_rating.csv').drop('')
usages = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true') \
    .load('train_usage.csv').drop('')
revenues = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true') \
    .load('train_revenue.csv').drop('')
output = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true') \
    .load('train_final_downloads.csv').drop('')
prev_downloads = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true') \
    .load('train_cumulative_downloads_2015-02.csv').drop('')  
release_date = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='true',inferSchema='true') \
    .load('train_release_date.csv').drop('')
text_score = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='false',inferSchema='true') \
    .load('sentiment.csv').drop('')
title_score = sqlCtx.read \
    .format('com.databricks.spark.csv') \
    .options(header='false',inferSchema='true') \
    .load('t_sentiment.csv').drop('')
    
reviews = pd.read_csv('train_app_review.csv')
reviews_schema = StructType([
    StructField("id",IntegerType(),True),
    StructField("name",StringType(),True),
    StructField("country",StringType(),True),
    StructField("rating",IntegerType(),True),
    StructField("date",StringType(),True),
    StructField("title",StringType(),True),
    StructField("version",StringType(),True),
    StructField("text",StringType(),True),
    StructField("reviewer",StringType(),True)
    
])
reviews = sqlCtx.createDataFrame(reviews,reviews_schema)

#### Generate Predictors

In [3]:
old_dateRange = pd.date_range('2015-03-01', periods=56).format(formatter=lambda x: x.strftime('%Y-%m-%d'))
dateRange = pd.date_range('2015-03-01', periods=56).format(formatter=lambda x: x.strftime('%Y_%m_%d'))
for d in range(56):
    revenues = revenues.withColumnRenamed(old_dateRange[d],dateRange[d])
    usages = usages.withColumnRenamed(old_dateRange[d],dateRange[d])
    downloads = downloads.withColumnRenamed(old_dateRange[d],dateRange[d])

In [4]:
#Initialization
predictors = downloads['id','name','category','device']

In [5]:
# Generate the weekly downloads
sqlCtx.registerDataFrameAsTable(downloads, "downloads")
predictors = sqlCtx.sql("SELECT id, name, category, device , "+\
           "+".join(dateRange[0:7])+" AS week_1 ,"+\
           "+".join(dateRange[7:14])+" AS week_2 ,"+\
           "+".join(dateRange[14:21])+" AS week_3 ,"+\
           "+".join(dateRange[21:28])+" AS week_4 ,"+\
           "+".join(dateRange[28:35])+" AS week_5 ,"+\
           "+".join(dateRange[35:42])+" AS week_6 ,"+\
           "+".join(dateRange[42:49])+" AS week_7 ,"+\
           "+".join(dateRange[49:56])+" AS week_8 ,"+\
           "+".join(dateRange)+" AS download_sum \
           from downloads")

In [6]:
# Make coefficients

def get_coefficients(*args):
    #The first element of the list is the degree of the coefficient
    args = list(args)
    return  float(np.polyfit(range(56),np.cumsum(args[1:]),args[0])[0])
    
#Generate the step max and min 
def get_maxStep(maximum,*args):
    args=list(args)
    if (np.count_nonzero(args) == 0):
        return 0
    m = 0
    for d in range(1,56):
        if (args[d]!=0 and args[d-1]!=0):
            c = (args[d]-args[d-1])
            if (maximum and m < c):
                m = c
            if ( not maximum and m > c):
                m = c
    return m

def get_std(*args):
    return float(np.std(list(args)))

def get_nbMissing(*args):
    return list(args).count(-1)
replacementValue = 0
#Generate the daily average
def get_dailyAvg(*inp):
    if (np.count_nonzero(inp - replacementValue*np.ones(len(inp))) == 0):
        return 0
    return  (1.0*sum(inp)/np.count_nonzero(inp - replacementValue*np.ones(len(inp))))

def get_usage_coefficients(*args):
    #The first element of the list is the degree of the coefficient
    args = list(args)
    return  float(np.polyfit(range(8),args[1:],args[0])[0])

def get_revenue_coefficients(*args):
    #The first element of the list is the degree of the coefficient
    args = list(args)
    return  float(np.polyfit(range(56),args[1:],args[0])[0])

sqlCtx.registerFunction("get_nbMissing", get_nbMissing,returnType=IntegerType())
sqlCtx.registerFunction("get_std", get_std,returnType=FloatType())
sqlCtx.registerFunction("get_maxStep", get_maxStep,returnType=IntegerType())
sqlCtx.registerFunction("get_coefficients", get_coefficients,returnType=FloatType())
sqlCtx.registerFunction("daily_avg", get_dailyAvg,returnType=FloatType())
sqlCtx.registerFunction("get_usage_coefficients", get_usage_coefficients,returnType=FloatType())
sqlCtx.registerFunction("get_revenue_coefficients", get_revenue_coefficients,returnType=FloatType())

temp_downloads = sqlCtx.sql("SELECT id,name,category, device \
, get_coefficients(0,"+",".join(dateRange)+") AS coef_0 \
,get_coefficients(1,"+",".join(dateRange)+") AS coef_1 \
,get_coefficients(2,"+",".join(dateRange)+") AS coef_2 \
,get_coefficients(3,"+",".join(dateRange)+") AS coef_3 \
,get_maxStep(True,"+",".join(dateRange)+") AS max_step \
,get_maxStep(False,"+",".join(dateRange)+") AS min_step \
,get_std("+",".join(dateRange)+") AS downloads_std \
,get_nbMissing("+",".join(dateRange)+") AS nb_missing \
,daily_avg(" + ",".join(dateRange[0:56]) + ") AS daily_avg \
 FROM downloads")

sqlCtx.registerDataFrameAsTable(usages, "usages")
temp_usages = sqlCtx.sql("SELECT id, name, category, device, metric, \
get_usage_coefficients(0,"+",".join(usages.columns[5:13])+") AS u_coef_0, \
get_usage_coefficients(1,"+",".join(usages.columns[5:13])+") AS u_coef_1, \
get_usage_coefficients(2,"+",".join(usages.columns[5:13])+") AS u_coef_2 FROM usages")

sqlCtx.registerDataFrameAsTable(revenues, "revenues")
temp_revenues = sqlCtx.sql("SELECT id, name, category, device, \
get_revenue_coefficients(0,"+",".join(revenues.columns[4:])+") AS rev_coef_0, \
get_revenue_coefficients(1,"+",".join(revenues.columns[4:])+") AS rev_coef_1, \
get_revenue_coefficients(2,"+",".join(revenues.columns[4:])+") AS rev_coef_2 FROM revenues")

predictors = predictors.join(temp_downloads,["id","name","category","device"])



In [7]:
# previous downloads addition
predictors = predictors.join(prev_downloads,["id","device"])

In [8]:
# Days since release generation
def get_days(date):
    return (datetime.datetime.strptime('03/01/2015', '%m/%d/%Y').date() \
            - datetime.datetime.strptime(date, '%m/%d/%Y').date()).days

sqlCtx.registerDataFrameAsTable(release_date, "release_date")
sqlCtx.registerFunction("get_days", get_days,returnType=IntegerType())
temp_date = sqlCtx.sql("SELECT id,name \
, get_days(release_date) AS days_since_release \
 FROM release_date")

predictors = predictors.join(temp_date,["id","name"],"left")

In [9]:
#ratings generation
sqlCtx.registerDataFrameAsTable(ratings, "ratings")
temp_ratings = sqlCtx.sql("SELECT id,name,category \
, start1/(start1+star2+star3+star4+star5) AS star1 \
, star2/(start1+star2+star3+star4+star5) AS star2 \
, star3/(start1+star2+star3+star4+star5) AS star3 \
, star4/(start1+star2+star3+star4+star5) AS star4 \
, star5/(start1+star2+star3+star4+star5) AS star5 \
, (start1+star2+star3+star4+star5) AS num_ratings \
 FROM ratings")

predictors = predictors.join(temp_ratings,["id","name","category"],"left")

In [10]:
# Categories
list_categories = [ x.category.replace(" ","_") for x in sqlCtx.sql("SELECT category \
 FROM downloads\
 group by category \
 ").collect()]
for cat in list_categories:
    sqlCtx.registerDataFrameAsTable(predictors, "predictors")
    predictors=sqlCtx.sql('''SELECT *, CASE WHEN (category = "'''+cat+'''") THEN 1 ELSE 0 END AS '''+cat+''' FROM predictors''')


In [11]:
# Device
sqlCtx.registerDataFrameAsTable(predictors, "predictors")
predictors=sqlCtx.sql('''SELECT *, CASE WHEN (device = "iphone") THEN 1 ELSE 0 END AS iphone FROM predictors''')
sqlCtx.registerDataFrameAsTable(predictors, "predictors")
predictors=sqlCtx.sql('''SELECT *, CASE WHEN (device = "ipad") THEN 1 ELSE 0 END AS ipad FROM predictors''')

In [12]:
def get_language(x):
    try:
        detected = langdetect.detect_langs(x.decode('utf8','ignore'))[0]
        if detected.prob < 0.7:
            return "other"
        else :
            return detected.lang
    except:
        return "other"
sqlCtx.registerFunction("get_language", get_language,returnType=StringType())

In [13]:
#Language of the title
lang = ['ja','zh-cn','ko','en']
for l in lang:
    sqlCtx.registerDataFrameAsTable(predictors, "predictors")
    predictors=sqlCtx.sql('''SELECT *, CASE WHEN (get_language(name) = "'''+l+'''") THEN 1 \
    ELSE 0 END AS '''+l.replace("-","_")+''' FROM predictors''')

In [15]:
#Reviews 
#escape is used in case some asshole used - or [space] anywhere
def escape(text):
    return text.replace(" ","_").replace("-","_")
# number of reviews
def get_recentReviews(date):
    return int((datetime.datetime.strptime('03/01/2015', '%m/%d/%Y').date() \
            - datetime.datetime.strptime(date, '%Y-%m-%d').date()).days >=0)

#First step
list_countries =['United_States', 'France', 'Japan', 'Spain', 'United_Kingdom','Saudi_Arabia', 'Germany'\
     , 'Hong_Kong', 'Switzerland', 'Turkey','Netherlands', 'Australia', 'Norway', 'Sweden', 'China', 'Canada'\
     ,'Tanzania', 'Denmark', 'South_Korea', 'Italy', 'Finland', 'Taiwan','Russia', 'Philippines', 'Slovenia'\
     , 'Ireland', 'Belgium', 'Mexico','Austria', 'India', 'Brazil', 'Benin', 'New_Zealand','United_Arab_Emirates'\
     , 'Ukraine', 'Poland', 'Israel', 'Portugal','Tunisia', 'Mali', 'Slovakia', 'Zimbabwe', 'Thailand', 'Panama'\
     ,'Indonesia', 'Singapore', 'Greece', 'Senegal', 'Nicaragua','Hungary', 'Czech_Republic', 'Macedonia', 'Chile'\
     , 'Uruguay','Malaysia', 'Algeria', 'Nepal', 'Mauritania', 'Croatia']

cmd = '''review_rdd = reviews\
.map(lambda x : (x.id , Row(id = x.id , avg_review = x.rating \
, recent_review = get_recentReviews(x.date), nb_review = 1\
,version = set([x.version])'''
for c in list_countries:
    cmd+=","+c+''' = int( escape(x.country) == "'''+c+'''")'''
cmd+=")))"
exec cmd

#Group step
cmd = '''review_rdd = review_rdd.reduceByKey(lambda x1 ,x2 : Row(\
 avg_review = x1.avg_review + x2.avg_review\
   ,recent_review = x1.recent_review + x2.recent_review, nb_review = x1.nb_review + x2.nb_review'''
for c in list_countries:
    cmd+=" , "+c+" = x1."+c+" + x2."+c
cmd+=", version = x1.version.union(x2.version)))"
exec cmd

# Clean the grouped rdd
cmd = '''review_rdd = review_rdd.map(lambda (id , x) : [ id \
,  1.0*x.avg_review /  x.nb_review\
   , x.recent_review,  x.nb_review'''
for c in list_countries:
    cmd+=" , 1.0* x."+c+"/ x.nb_review"
cmd+=",  len(x.version) - 1])"
exec cmd

#Put back into dataframe
grp_reviews = sqlCtx.createDataFrame(review_rdd, ["id","avg_review"\
             ,"recent_reviews","nb_review"] + list_countries + ["versions"])

#Join with predictors 
predictors = predictors.join(grp_reviews,["id"],"left")

In [16]:
# Generate DL Projection
sqlCtx.registerDataFrameAsTable(predictors, "predictors")
dl_projection = sqlCtx.sql("SELECT id, device, (download_sum+7*`cumulative_downloads_2015-02`) AS dl_projection \
                                FROM predictors")
predictors = predictors.join(dl_projection,["id","device"],"left")

In [17]:
predictors_Pandas = predictors.toPandas()

In [18]:
predictors_Pandas

Unnamed: 0,id,device,name,category,week_1,week_2,week_3,week_4,week_5,week_6,week_7,week_8,download_sum,coef_0,coef_1,coef_2,coef_3,max_step,min_step,downloads_std,nb_missing,daily_avg,cumulative_downloads_2015-02,days_since_release,star1,star2,star3,star4,star5,num_ratings,Social_Networking,Finance,Books,Business,Newsstand,Games,Navigation,News,Music,Weather,Catalogs,Health_and_Fitness,Food_and_Drink,Shopping,Lifestyle,Productivity,Sports,Reference,Utilities,Education,Photo_and_Video,Entertainment,Medical,Travel,iphone,ipad,ja,zh_cn,ko,en,avg_review,recent_reviews,nb_review,United_States,France,Japan,Spain,United_Kingdom,Saudi_Arabia,Germany,Hong_Kong,Switzerland,Turkey,Netherlands,Australia,Norway,Sweden,China,Canada,Tanzania,Denmark,South_Korea,Italy,Finland,Taiwan,Russia,Philippines,Slovenia,Ireland,Belgium,Mexico,Austria,India,Brazil,Benin,New_Zealand,United_Arab_Emirates,Ukraine,Poland,Israel,Portugal,Tunisia,Mali,Slovakia,Zimbabwe,Thailand,Panama,Indonesia,Singapore,Greece,Senegal,Nicaragua,Hungary,Czech_Republic,Macedonia,Chile,Uruguay,Malaysia,Algeria,Nepal,Mauritania,Croatia,versions,dl_projection
0,310755560,ipad,Rebtel - Cheap International Voice Calls,Social Networking,2074,1881,2181,1969,2086,1620,1740,1825,15376,8061.785645,275.206024,-6.349383e-01,-1.183873e-02,59,-70,34.969303,0,274.571442,373574,1964.0,0.137190,0.044592,0.070302,0.188109,0.559807,9957.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,2.296296,2.0,27.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2630394
1,319691481,iphone,Ocado,Shopping,2204,2352,3342,3274,3196,1712,2072,2277,20429,10763.588867,382.353271,-1.407032e+00,-9.737281e-02,265,-295,117.610802,0,364.803558,300102,2065.0,0.302861,0.103586,0.168896,0.140375,0.284281,10764.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.285714,0.0,14.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2121143
2,325658560,ipad,BFMTV : l'info en continu,News,3213,3260,3113,6196,2607,1609,1220,2055,23273,13514.303711,437.845276,-4.559468e+00,-1.761865e-01,341,-359,228.732361,0,415.589294,216942,608.0,0.135135,0.108108,0.081081,0.162162,0.513514,37.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2.954545,1.0,66.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1541867
3,329748160,ipad,Ricette,Food and Drink,47,51,8,-7,21,53,6,53,232,118.267860,3.102085,8.344071e-03,3.558010e-03,26,-22,7.709986,38,4.142857,23884,1993.0,0.235294,0.235294,0.176471,0.088235,0.264706,34.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,167420
4,339556760,ipad,Winmau Darts Scorer Lite,Sports,116,252,216,95,125,174,51,131,1160,657.339294,20.357040,-1.762069e-01,4.450537e-04,58,-51,18.716602,13,20.714285,12923,1927.0,0.399237,0.176732,0.164654,0.087095,0.172282,1573.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,91621
5,342792281,iphone,Old Navy,Shopping,6408,6959,6434,6715,6625,6968,6963,6389,53461,27154.427734,963.577881,4.155779e-01,-4.174683e-03,411,-236,135.089432,0,954.660706,691345,1906.0,0.270254,0.148903,0.174037,0.117032,0.289774,11578.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,2.000000,0.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4892876
6,343864881,iphone,GTA IV Cheats Guide - FREE,Reference,1478,1436,1214,1221,1350,1328,1275,1297,10599,5490.606934,186.236984,-1.003128e-01,1.172101e-02,60,-48,22.382624,0,189.267853,273515,,0.257841,0.092406,0.119400,0.096102,0.434252,8928.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1925204
7,365876760,ipad,Kur'an Fihristi,Reference,157,132,157,172,134,49,117,163,1081,572.982117,18.543301,-1.022780e-01,-1.016554e-03,58,-28,11.245506,8,19.303572,26093,1790.0,0.077143,0.051429,0.062857,0.065714,0.742857,350.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1.800000,0.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,183732
8,366078960,ipad,Handball Board Free,Sports,252,278,262,222,235,221,228,308,2006,1019.071411,34.704510,-3.299589e-02,4.488125e-03,13,-11,5.600087,0,35.821430,44770,1787.0,0.385350,0.124204,0.108280,0.092357,0.289809,314.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,315396
9,368497281,iphone,Fox8 Max Weather,Weather,789,616,580,603,551,820,882,638,5479,2736.035645,95.987015,3.912184e-01,1.240870e-02,58,-51,24.842918,0,97.839287,99115,1760.0,0.250000,0.104167,0.187500,0.125000,0.333333,48.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,699284


In [14]:
predictors.head(1)

[Row(id=306685844, name=u'Bible Promises', category=u'Reference', device=u'iphone', week_1=2030, week_2=1489, week_3=2147, week_4=1803, week_5=2063, week_6=2099, week_7=2092, week_8=1495, download_sum=15218, coef_0=7829.3037109375, coef_1=277.8722839355469, coef_2=0.3066839575767517, coef_3=-0.025372017174959183, max_step=201, min_step=-302, downloads_std=60.261112213134766, nb_missing=1, daily_avg=271.75, cumulative_downloads_2015-02=56532, days_since_release=2188, star1=0.023523876734885908, star2=0.014820042342978124, star3=0.024935309338979063, star4=0.11079745942131264, star5=0.8259233121618442, num_ratings=4251, Social_Networking=0, Finance=0, Books=0, Business=0, Newsstand=0, Games=0, Navigation=0, News=0, Music=0, Weather=0, Catalogs=0, Health_and_Fitness=0, Food_and_Drink=0, Shopping=0, Lifestyle=0, Productivity=0, Sports=0, Reference=1, Utilities=0, Education=0, Photo_and_Video=0, Entertainment=0, Medical=0, Travel=0, iphone=1, ipad=0, ja=0, zh_cn=0, ko=0, en=0)]

In [20]:
release_date.head(1)

[Row(id=281704574, name=u'AIM: Chat, Free Text, Photo Share, Voice Message', release_date=u'7/11/2008')]