In [1]:
import time
from pyspark.sql.types import *
from pyspark.sql.functions import to_date, col, lit, unix_timestamp
from pyspark.ml.feature import StringIndexer, OneHotEncoder, PCA
from pyspark.sql.functions import to_timestamp, date_format, hour, year, month, dayofmonth
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml.clustering import KMeans

In [2]:
# Echo `sc` as the cell output. The name is not defined anywhere in this
# notebook, so it is presumably the SparkContext injected by the PySpark
# kernel -- TODO confirm against the kernel configuration.
sc

### Common Functions

In [3]:
def indexStringColumns(df, cols):
    """Replace each column in ``cols`` with its StringIndexer output.

    For every name, a StringIndexer is fitted on the current frame and
    writes to a temporary ``<name>-num`` column; the source column is then
    dropped and the temporary column is renamed back to ``<name>``.

    :param df: input DataFrame (never modified; a new frame is returned).
    :param cols: iterable of column names to index, processed in order.
    :return: DataFrame in which each listed column holds the indexer output.
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        # fit() sees the frame as transformed so far, not the original input.
        fitted = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        indexed = (fitted.transform(indexed)
                   .drop(name)
                   .withColumnRenamed(tmp_name, name))
    return indexed

def make_dates(df, colName, newCol):
    """Parse a 'yyyy-MM-dd HH:mm:ss' column into a timestamp column.

    :param df: input DataFrame.
    :param colName: name of the source column; it is dropped from the result.
    :param newCol: name of the column that receives the parsed timestamp.
    :return: new DataFrame with ``newCol`` added and ``colName`` removed.
    """
    parsed = to_timestamp(colName, 'yyyy-MM-dd HH:mm:ss')
    with_timestamp = df.withColumn(newCol, parsed)
    return with_timestamp.drop(colName)

def get_dateinfo(df, colName):
    """Add calendar feature columns derived from the column ``colName``.

    Columns are appended in this order: ``dow`` (the 'u' date-format field
    cast to integer), ``hour``, ``day`` (day of month), ``month``, ``year``.

    :param df: input DataFrame.
    :param colName: name of the date/timestamp column to decompose.
    :return: new DataFrame with the five extra columns; ``colName`` is kept.
    """
    derived = [
        ('dow', date_format(colName, 'u').cast(IntegerType())),
        ('hour', hour(colName)),
        ('day', dayofmonth(colName)),
        ('month', month(colName)),
        ('year', year(colName)),
    ]
    result = df
    for new_name, expression in derived:
        result = result.withColumn(new_name, expression)
    return result
    

def oneHotEncodeColumns(df, cols):
    """Replace each column in ``cols`` with its one-hot encoded vector.

    Each column is encoded into a temporary ``<name>-onehot`` column with
    ``dropLast=False`` (the last category is kept in the encoded vector);
    the source column is dropped and the temporary column is renamed back
    to ``<name>``.

    :param df: input DataFrame (never modified; a new frame is returned).
    :param cols: iterable of column names to encode, processed in order.
    :return: DataFrame in which each listed column holds the encoded vector.
    """
    encoded = df
    for name in cols:
        tmp_name = name + "-onehot"
        # The encoder is applied directly via transform(); no fit step is used here.
        encoder = OneHotEncoder(inputCol=name, outputCol=tmp_name, dropLast=False)
        encoded = (encoder.transform(encoded)
                   .drop(name)
                   .withColumnRenamed(tmp_name, name))
    return encoded

### Main Study Functions

In [4]:
def get_green_data(sz='30M'):
    start = time.time()
    if sz == '30M':
        uri = "mongodb://ec2-35-162-204-59.us-west-2.compute.amazonaws.com/taxidb.taxidata30M_g"
    else:
        uri = "mongodb://ec2-35-162-204-59.us-west-2.compute.amazonaws.com/taxidb.taxidata10M_y"
    
    green_df = spark.read.format("com.mongodb.spark.sql.DefaultSource")\
        .option("uri",uri)\
        .load()
    green_df.repartition(24)
    
    select_cols = [x for x in green_df.columns if x not in ['_id','trip_type', 'ehail_fee'] ]
    g_df = green_df.select(select_cols)
    
    if sz == '3M':
        g_df = g_df.sample(False,0.3, seed=1)
    elif sz == '1M':
        g_df = g_df.sample(False,0.1, seed=1)
    elif sz == '500K':
        g_df = g_df.sample(False,0.05, seed=1)        
    elif sz == '100K':
        g_df = g_df.sample(False,0.01, seed=1)                
    elif sz == '10K':
        g_df = g_df.sample(False,0.001, seed=1)      

    g_df.cache()
    print 'green data loaded...', time.time() - start

    return g_df

def get_yellow_data(sz='30M'):
    start = time.time()
    if sz == '30M':
        uri = "mongodb://ec2-35-162-204-59.us-west-2.compute.amazonaws.com/taxidb.taxidata30M_y"
    else:
        uri = "mongodb://ec2-35-162-204-59.us-west-2.compute.amazonaws.com/taxidb.taxidata10M_g"
        
    yellow_df = spark.read.format("com.mongodb.spark.sql.DefaultSource")\
        .option("uri",uri)\
        .load()
    yellow_df.repartition(24)
    
    select_cols = [x for x in yellow_df.columns if x not in ['_id','trip_type', 'ehail_fee'] ]
    y_df = yellow_df.select(select_cols)
    
    if sz == '3M':
        y_df = y_df.sample(False,0.3, seed=1)
    elif sz == '1M':
        y_df = y_df.sample(False,0.1, seed=1)
    elif sz == '500K':
        y_df = y_df.sample(False,0.05, seed=1)        
    elif sz == '100K':
        y_df = y_df.sample(False,0.01, seed=1)                
    elif sz == '10K':
        y_df = y_df.sample(False,0.001, seed=1)                        
        
        
    y_df.cache()
    print 'yellow data loaded...', time.time() - start
    return y_df
    
    
def concat_data_set(sz='30M'):
    print 'combining datasets ...'
    g_df = get_green_data(sz)
    y_df = get_yellow_data(sz)
    
    total_df = y_df.unionAll(g_df)
    print 'repartitioning to 24 partitions ...'
    total_df.repartition(24)
    total_df.cache()
    return total_df

def feature_eng(total_df):
    start = time.time()
    print 'starting feature engineering...'
    timeDiff = (unix_timestamp('tpep_dropoff_datetime') - unix_timestamp('tpep_pickup_datetime'))
    df_dates = total_df.withColumn("Duration", timeDiff)
    df_dates = df_dates.filter("Duration < 7200 and Duration > 60" )
    df_dates = make_dates(df_dates, "tpep_dropoff_datetime", "dropoff_datetime")
    df_dates = make_dates(df_dates, "tpep_pickup_datetime", "pickup_datetime")
    new_df = indexStringColumns(df_dates, ["store_and_fwd_flag"])
    new_df = get_dateinfo(new_df, 'dropoff_datetime')
    new_df = new_df.drop("dropoff_datetime").drop("pickup_datetime").drop("store_and_fwd_flag").drop("ehail_fee").drop("trip_type")
    
    
    onehot_cols = [ x for x in new_df.columns if 'ID' in x]
    onehot_cols = onehot_cols + ['dow','day','month'] # took out weather + condition

    dfhot = oneHotEncodeColumns(new_df,onehot_cols)
    
    print 'feature eng complete ... ', time.time() - start
    return dfhot

def df2vec(df_in, study_type = 'classification'):
    if study_type == 'classification':
        df_for_model = df_in.withColumnRenamed("color","label")
        va = VectorAssembler(outputCol="features", inputCols=df_for_model.drop("label").columns) #except the last col.
        taxi_points = va.transform(df_for_model).select("features", "label")  
    else:
        df_travel = df_in.withColumnRenamed("Duration", "label")
        va = VectorAssembler(outputCol="features", inputCols=df_travel.drop("label").columns) #except the last col.
        taxi_points = va.transform(df_travel).select("features", "label")

    split_data = taxi_points.randomSplit([0.8, 0.2])
    training = split_data[0].cache()
    test = split_data[1].cache()
    
    print 'data converted into train and test vecs...'
    return training, test


def run_model(df_for_model, sz='30M',model='rfc'):
    if model in ('rfc', 'logreg'):
        training, test = df2vec(df_for_model, study_type = 'classification')
    else:
        training, test = df2vec(df_for_model, study_type = 'regression')
    
    print 'creating model and fitting ...'
    start = time.time()
    if model == 'rfc':
        m = RandomForestClassifier(maxDepth=15)
    elif model == 'linreg':
        m = LinearRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8)
    elif model == 'rfr':
        m = RandomForestRegressor(maxDepth=15)
    elif model == 'logreg':
        m = LogisticRegression()
    elif model == 'PCA':
        m = PCA(k=2, inputCol="features", outputCol="pcaFeatures") 
    elif model == 'Kmeans':
        m = KMeans().setK(10).setFeaturesCol("features").setPredictionCol("prediction")
 
    m_fit = m.fit(training)
    print '='*20 + '%s  MODEL RESULTS' % model + '='*20
    print '%s for %s fitting time: %f' % (sz, model, time.time() - start)
    start = time.time()
    
    if model in ('rfc','rfr', 'logreg','linreg'):
        predicts_train = m_fit.transform(training)
        predicts_test = m_fit.transform(test)
        print '%s for %s prediction time: %f' % (sz, model, time.time() - start)
        
    if model in ('rfc', 'logreg'):
        evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
        train_accuracy = evaluator.evaluate(predicts_train)
        test_accuracy = evaluator.evaluate(predicts_test)
        print "train acc %g" % train_accuracy
        print "train acc %g" % test_accuracy
        
    elif model in ('rfr', 'linreg'):
        evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
        test_rmse = evaluator.evaluate(predicts_test)
        train_rmse = evaluator.evaluate(predicts_train)
        print "train RMSE = %f " % test_rmse
        print "train RMSE = %f " % train_rmse

In [5]:
# Size label: forwarded to the data loaders and echoed in run_model()'s timing lines.
sz = '500K'
# Load the green and yellow trips for this size label and union them.
total_df = concat_data_set(sz)
# Build the feature table that every run_model() call below reuses.
df_for_model = feature_eng(total_df)

combining datasets ...
green data loaded... 3.02056002617
yellow data loaded... 1.32047605515
repartitioning to 24 partitions ...
starting feature engineering...
feature eng complete ...  56.867620945


In [6]:
# Five repeated linear-regression runs; each call builds its own train/test split.
for _ in range(5):
    run_model(df_for_model, sz=sz, model='linreg')

data converted into train and test vecs...
creating model and fitting ...
500K for linreg fitting time: 7.787707
500K for linreg prediction time: 0.091580
train RMSE = 326.395036 
train RMSE = 305.137645 
data converted into train and test vecs...
creating model and fitting ...
500K for linreg fitting time: 4.017936
500K for linreg prediction time: 0.067330
train RMSE = 317.024114 
train RMSE = 308.111395 
data converted into train and test vecs...
creating model and fitting ...
500K for linreg fitting time: 3.816162
500K for linreg prediction time: 0.081307
train RMSE = 302.835045 
train RMSE = 311.275775 
data converted into train and test vecs...
creating model and fitting ...
500K for linreg fitting time: 3.685706
500K for linreg prediction time: 0.086439
train RMSE = 308.716170 
train RMSE = 311.727834 
data converted into train and test vecs...
creating model and fitting ...
500K for linreg fitting time: 3.845075
500K for linreg prediction time: 0.064128
train RMSE = 306.306803 


In [7]:
# Five repeated PCA fits; only the fitting time is printed for this model.
for _ in range(5):
    run_model(df_for_model, sz=sz, model='PCA')

data converted into train and test vecs...
creating model and fitting ...
500K for PCA fitting time: 5.606237
data converted into train and test vecs...
creating model and fitting ...
500K for PCA fitting time: 4.056246
data converted into train and test vecs...
creating model and fitting ...
500K for PCA fitting time: 4.281460
data converted into train and test vecs...
creating model and fitting ...
500K for PCA fitting time: 4.246748
data converted into train and test vecs...
creating model and fitting ...
500K for PCA fitting time: 4.248063


In [8]:
# Five repeated k-means fits; only the fitting time is printed for this model.
for _ in range(5):
    run_model(df_for_model, sz=sz, model='Kmeans')


data converted into train and test vecs...
creating model and fitting ...
500K for Kmeans fitting time: 7.911419
data converted into train and test vecs...
creating model and fitting ...
500K for Kmeans fitting time: 7.422651
data converted into train and test vecs...
creating model and fitting ...
500K for Kmeans fitting time: 6.765577
data converted into train and test vecs...
creating model and fitting ...
500K for Kmeans fitting time: 6.512365
data converted into train and test vecs...
creating model and fitting ...
500K for Kmeans fitting time: 6.549885


In [9]:
# Five repeated logistic-regression runs; each call builds its own train/test split.
for _ in range(5):
    run_model(df_for_model, sz=sz, model='logreg')

data converted into train and test vecs...
creating model and fitting ...
500K for logreg fitting time: 16.419207
500K for logreg prediction time: 0.067501
train acc 0.963093
train acc 0.961221
data converted into train and test vecs...
creating model and fitting ...
500K for logreg fitting time: 15.814246
500K for logreg prediction time: 0.116042
train acc 0.962869
train acc 0.962288
data converted into train and test vecs...
creating model and fitting ...
500K for logreg fitting time: 17.064737
500K for logreg prediction time: 0.076642
train acc 0.962885
train acc 0.962351
data converted into train and test vecs...
creating model and fitting ...
500K for logreg fitting time: 15.087999
500K for logreg prediction time: 0.066341
train acc 0.962781
train acc 0.962994
data converted into train and test vecs...
creating model and fitting ...
500K for logreg fitting time: 16.261257
500K for logreg prediction time: 0.074874
train acc 0.96258
train acc 0.963671


In [10]:
# Five repeated random-forest classifier runs; each call builds its own train/test split.
for _ in range(5):
    run_model(df_for_model, sz=sz, model='rfc')

data converted into train and test vecs...
creating model and fitting ...
500K for rfc fitting time: 29.745894
500K for rfc prediction time: 0.105231
train acc 0.914849
train acc 0.913559
data converted into train and test vecs...
creating model and fitting ...
500K for rfc fitting time: 27.855503
500K for rfc prediction time: 0.113831
train acc 0.910865
train acc 0.909672
data converted into train and test vecs...
creating model and fitting ...
500K for rfc fitting time: 27.899847
500K for rfc prediction time: 0.106252
train acc 0.911913
train acc 0.911021
data converted into train and test vecs...
creating model and fitting ...
500K for rfc fitting time: 27.463545
500K for rfc prediction time: 0.109723
train acc 0.909619
train acc 0.908241
data converted into train and test vecs...
creating model and fitting ...
500K for rfc fitting time: 26.711805
500K for rfc prediction time: 0.118944
train acc 0.914457
train acc 0.913227


In [11]:
# Five repeated random-forest regressor runs; each call builds its own train/test split.
for _ in range(5):
    run_model(df_for_model, sz=sz, model='rfr')

data converted into train and test vecs...
creating model and fitting ...
500K for rfr fitting time: 200.076828
500K for rfr prediction time: 4.424872
train RMSE = 173.109185 
train RMSE = 152.148342 
data converted into train and test vecs...
creating model and fitting ...


KeyboardInterrupt: 