# Setup

In [1]:
import pandas as pd
import numpy as np

import os

from datetime import datetime
import time

import seaborn as sns
import matplotlib.pyplot as plt

import statistics
import random

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
if os.path.exists('dftrain.csv'):
    dftrain = pd.read_csv('dftrain.csv')
if os.path.exists('dftest.csv'):
    dftest = pd.read_csv('dftest.csv')

In [3]:
df = pd.read_csv('train_cleaned.csv')

# LDA
### Upperbound = median

In [4]:
def lda_complete(taxi_id, is_cut=True, cutoff=0.5, test_size=0.99):
    newtrain=dftrain[dftrain['TAXI_ID'] == taxi_id]
    
    train_corr=newtrain[['TRAVEL_TIME', 'YEAR', 'MONTH', 'WEEK', 'DAY', 'HOUR', 'MIN', 'WEEKDAY']].corr()
    positive_corr_list=list(train_corr[train_corr > 0]['TRAVEL_TIME'].dropna().keys())
    positive_corr_list.remove('TRAVEL_TIME')
    
    ## outlier cut
    tmpdf = newtrain
    Q1 = tmpdf['TRAVEL_TIME'].quantile(0.25)
    Q2 = tmpdf['TRAVEL_TIME'].max()
    if is_cut:
        Q2 = tmpdf['TRAVEL_TIME'].quantile(cutoff)
    Q3 = tmpdf['TRAVEL_TIME'].quantile(0.75)
    IQR = Q3 - Q1

    # Define the lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q2

    new_train = newtrain[newtrain.TRAVEL_TIME >= lower_bound]
    new_train = newtrain[newtrain.TRAVEL_TIME <= upper_bound]

    print('upper_bound', upper_bound)
    
    avail_times = new_train.TRAVEL_TIME.unique()
    n = len(avail_times)
    range_ = list(range(n))
    labels = dict(zip(avail_times,range_))
    labels_rev = dict(zip(range_, avail_times))
    
    X = []
    y = []

    for row in new_train.iterrows():

        lst = []
        for elem in positive_corr_list:
            lst.append(row[1][elem])

        X.append(np.array(lst))
        y.append(np.array(labels[row[1]['TRAVEL_TIME']]))
    
    # Split the dataset into training and testing sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=42)
    
    ytrain = []
    for i in y_train:
        ytrain.append(i.item())
        
    if len(set(ytrain)) >= len(X_train):
        return (None, None)
    
    # Create an LDA classifier
    lda = LinearDiscriminantAnalysis()

    # Fit the LDA model on the training data
    lda.fit(X_train, y_train)

    # Predict the class labels for the test data
    y_pred = lda.predict(X_train)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_train, y_pred)
    print("Training Accuracy:", accuracy)
    
    mse_train = mean_squared_error(y_train, y_pred)
    rmse_train = mse_train**(1/2)    
    print('Training loss: RMSE=', rmse_train)
    
    new_test = dftest[dftest.TAXI_ID == taxi_id]
    
    Xtest = []
    trips = []
    
    for row in new_test.iterrows():
        trips.append(row[1]['TRIP_ID'])
        lst = []
        for elem in positive_corr_list:#, 'WEEKDAY']:
            lst.append(row[1][elem])
        Xtest.append(np.array(lst))
    
    preds = lda.predict(Xtest)
    r = len(preds)
    output = []
    for i in range(r):
        print(trips[i], labels_rev[preds[i]])
        output.append(labels_rev[preds[i]])
    return (trips, output)

# Try to scale

In [5]:
taxi_ids=list(dftest.TAXI_ID)

In [6]:
%%time

### Round 1 ###
all_trips = []
all_preds = []
not_ok = []
for taxi_id in taxi_ids:
    if taxi_id in [20000079, 20000206]:
        continue
    print('----------------------')
    print(taxi_id)
    trips, output = lda_complete(taxi_id=taxi_id, is_cut=True, cutoff=0.5)
    if output is None:
        not_ok.append(taxi_id)
    else:
        r = len(trips)

        for i in range(r):
            all_trips.append(trips[i])
            all_preds.append(output[i])
            
            
### Round 2 ###
not_ok_2 = []
for taxi_id in not_ok:
#     if taxi_id in [20000079, 20000206]:
#         continue
    print('----------------------')
    print(taxi_id)
    trips, output = lda_complete(taxi_id=taxi_id, is_cut=True, cutoff=0.5, test_size=0.98)
    if output is None:
        not_ok_2.append(taxi_id)
    else:
        r = len(trips)

        for i in range(r):
            all_trips.append(trips[i])
            all_preds.append(output[i])
            
### Round 3 ###
weird = [20000079, 20000206]
for taxi_id in weird:
    print('----------------------')
    print(taxi_id)
    trips, output = lda_complete(taxi_id=taxi_id, is_cut=True, cutoff=0.5, test_size=0.5)
    if output is None:
        not_ok_2.append(taxi_id)
    else:
        r = len(trips)

        for i in range(r):
            all_trips.append(trips[i])
            all_preds.append(output[i])

----------------------
20000542
upper_bound 630.0
Training Accuracy: 0.43333333333333335
Training loss: RMSE= 13.745302227791623
T1 390
T308 525
----------------------
20000108
upper_bound 585.0
Training Accuracy: 0.3333333333333333
Training loss: RMSE= 9.735387911006821
T2 405
T249 450
----------------------
20000370
upper_bound 555.0
Training Accuracy: 0.75
Training loss: RMSE= 3.2596012026013246
T3 510
----------------------
20000492
upper_bound 540.0
Training Accuracy: 0.5277777777777778
Training loss: RMSE= 7.2264944628929335
T4 465
T250 315
----------------------
20000621
upper_bound 540.0
Training Accuracy: 0.6944444444444444
Training loss: RMSE= 7.820912137766711
T5 495
----------------------
20000607
upper_bound 795.0
Training Accuracy: 0.9411764705882353
Training loss: RMSE= 4.608176875690327
T6 315
----------------------
20000310
upper_bound 585.0
Training Accuracy: 0.7916666666666666
Training loss: RMSE= 3.6685601171758564
T7 300
----------------------
20000619
upper_bound 

In [7]:
# ValueError: With n_samples=52, test_size=0.99 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.
20000079

# ValueError: Internal work array size computation failed: -10
20000206

20000206

In [8]:
not_ok_2 = []
for taxi_id in not_ok:
    print('----------------------')
    print(taxi_id)
    trips, output = lda_complete(taxi_id=taxi_id, is_cut=True, cutoff=0.5, test_size=0.98)
    if output is None:
        not_ok_2.append(taxi_id)
    else:
        r = len(trips)

        for i in range(r):
            all_trips.append(trips[i])
            all_preds.append(output[i])

----------------------
20000312
upper_bound 1005.0
Training Accuracy: 0.8
Training loss: RMSE= 3.1304951684997055
T14 345
T209 90
----------------------
20000547
upper_bound 570.0
Training Accuracy: 0.8695652173913043
Training loss: RMSE= 2.4583486857786396
T57 360
----------------------
20000071
upper_bound 615.0
Training Accuracy: 0.6153846153846154
Training loss: RMSE= 7.411010519698169
T95 375
----------------------
20000682
upper_bound 705.0
Training Accuracy: 0.92
Training loss: RMSE= 1.61245154965971
T114 555
----------------------
20000049
upper_bound 705.0
Training Accuracy: 0.6363636363636364
Training loss: RMSE= 8.890751067567606
T121 405
----------------------
20000248
upper_bound 600.0
----------------------
20000144
upper_bound 630.0
Training Accuracy: 1.0
Training loss: RMSE= 0.0
T147 450
T231 450
----------------------
20000510
upper_bound 780.0
Training Accuracy: 0.6666666666666666
Training loss: RMSE= 10.082988974836116
T149 480
----------------------
20000407
upper_b

In [9]:
trips, output = lda_complete(taxi_id=not_ok_2[0], is_cut=True, cutoff=0.5, test_size=0.94)
r = len(trips)

for i in range(r):
    all_trips.append(trips[i])
    all_preds.append(output[i])

upper_bound 600.0
Training Accuracy: 1.0
Training loss: RMSE= 0.0
T126 555


In [10]:
weird = [20000079, 20000206]
for taxi_id in weird:
    print('----------------------')
    print(taxi_id)
    trips, output = lda_complete(taxi_id=taxi_id, is_cut=True, cutoff=0.5, test_size=0.5)
    if output is None:
        not_ok_2.append(taxi_id)
    else:
        r = len(trips)

        for i in range(r):
            all_trips.append(trips[i])
            all_preds.append(output[i])

----------------------
20000079
upper_bound 675.0
Training Accuracy: 0.34615384615384615
Training loss: RMSE= 5.59532772120564
T275 180
----------------------
20000206
upper_bound 690.0
Training Accuracy: 0.052702702702702706
Training loss: RMSE= 13.579097609631205
T118 540
T197 540


In [11]:
dcts=dict(zip(all_trips, all_preds))
all_ids = list(dftest.TRIP_ID)
n = len(all_ids)
for i in range(n):
    print(dcts[all_ids[i]])

390
405
510
465
495
315
300
450
255
510
525
315
360
345
450
330
480
615
615
630
510
435
240
450
405
390
570
570
645
270
525
420
0
0
465
30
660
510
390
0
300
270
165
555
465
0
285
600
660
570
465
90
420
420
510
270
360
330
690
420
570
240
180
285
240
600
510
210
450
465
420
390
270
300
615
225
420
300
315
255
420
300
510
450
600
360
720
360
255
255
435
555
585
375
330
840
360
405
255
300
375
480
540
465
180
345
660
165
555
600
585
285
540
330
225
405
495
690
285
300
555
375
495
240
315
555
525
240
420
465
390
570
315
255
300
510
660
585
390
495
435
450
405
480
255
345
330
300
375
495
360
510
555
495
420
330
570
585
330
645
495
180
585
270
525
300
0
360
555
465
585
555
360
450
450
615
150
585
540
330
390
450
465
570
465
675
420
405
645
540
615
480
435
300
420
435
390
420
420
0
330
90
405
555
360
555
495
555
405
465
270
420
540
330
465
465
675
435
360
360
225
375
615
450
480
285
675
330
420
450
390
510
615
300
525
420
300
555
600
345
540
450
315
390
240
300
555
480
525
150
255
645
495
465

# LDA
### Lower bound >= 1050 (i.e. 15 * 70)

In [12]:
def lda_complete_2(taxi_id, is_cut=True, cutoff=1050, test_size=0.99):
    newtrain=dftrain[dftrain['TAXI_ID'] == taxi_id]
    
    train_corr=newtrain[['TRAVEL_TIME', 'YEAR', 'MONTH', 'WEEK', 'DAY', 'HOUR', 'MIN', 'WEEKDAY']].corr()
    positive_corr_list=list(train_corr[train_corr > 0]['TRAVEL_TIME'].dropna().keys())
    positive_corr_list.remove('TRAVEL_TIME')
    
    ## outlier cut
    tmpdf = newtrain
    if is_cut:
        lower_bound = cutoff

    new_train = newtrain[newtrain.TRAVEL_TIME >= lower_bound]

    print('lower_bound', lower_bound)
    
    avail_times = new_train.TRAVEL_TIME.unique()
    n = len(avail_times)
    range_ = list(range(n))
    labels = dict(zip(avail_times,range_))
    labels_rev = dict(zip(range_, avail_times))
    
    X = []
    y = []

    for row in new_train.iterrows():

        lst = []
        for elem in positive_corr_list:
            lst.append(row[1][elem])

        X.append(np.array(lst))
        y.append(np.array(labels[row[1]['TRAVEL_TIME']]))
    
    # Split the dataset into training and testing sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=42)
    
    ytrain = []
    for i in y_train:
        ytrain.append(i.item())
        
    if len(set(ytrain)) >= len(X_train):
        return (None, None)
    
    # Create an LDA classifier
    lda = LinearDiscriminantAnalysis()

    # Fit the LDA model on the training data
    lda.fit(X_train, y_train)

    # Predict the class labels for the test data
    y_pred = lda.predict(X_train)

    # Calculate the accuracy of the model
    accuracy = accuracy_score(y_train, y_pred)
    print("Training Accuracy:", accuracy)
    
    new_test = dftest[dftest.TAXI_ID == taxi_id]
    
    Xtest = []
    trips = []
    
    for row in new_test.iterrows():
        trips.append(row[1]['TRIP_ID'])
        lst = []
        for elem in positive_corr_list:#, 'WEEKDAY']:
            lst.append(row[1][elem])
        Xtest.append(np.array(lst))
    
    preds = lda.predict(Xtest)
    r = len(preds)
    output = []
    for i in range(r):
        print(trips[i], labels_rev[preds[i]])
        output.append(labels_rev[preds[i]])
    return (trips, output)

## Randomness
- We recorded the list of random indices (to swap the predictive results with another model) that give us the best model
- Unfortunately, we only printed out those random indices when experimenting with this technique (and submitting the results into Kaggle) using ```print()```, but we didn't record the random seed that potentially generated this list
- To get a list of random indices, we can do: ```random.sample(range(len(trips_with_uniq_taxi_ids)), 30)```

In [13]:
value_counts = dftest['TAXI_ID'].value_counts()
unique_values = value_counts.index.tolist()#value_counts[value_counts == 1].index.tolist()

In [14]:
trips_with_uniq_taxi_ids = list(sorted(dftest[dftest['TAXI_ID'].isin(unique_values)]['TRIP_ID']))

In [28]:
# to get this list of random indices, we can do: random.sample(range(len(trips_with_uniq_taxi_ids)), 30)
# we didn't set a fixed seed when generating these indices, but we had printed out the best result 
# the followings are 6 lists of random indices (stored in a dictionary) that generate the best result
random_indices = {}

In [16]:
random_indices[0] = [59,138,52,86,301,226,133,239,294,318,82,220,278,1,115,210,253,242,89,135,317,107,292,103,192,130,291,313,42,166]
random_indices[1] = [120,59,292,210,305,130,134,301,82,220,295,116,24,277,33,318,78,135,52,291,166,51,294,317,192,42,86,63,103,245]
random_indices[2] = [291,52,82,305,114,107,242,239,292,51,42,1,24,116,317,138,253,252,313,277,318,278,127,301,295,120,78,220,166,210]
random_indices[3] = [294,86,1,220,301,24,141,114,134,135,166,127,305,192,120,103,83,116,126,317,51,82,59,115,242,63,277,253,210,133]
random_indices[4] = [292,138,133,24,127,120,134,116,313,239,220,59,294,166,33,301,89,130,126,318,115,52,135,114,86,252,63,1,305,317]
random_indices[5] = [103,245,317,1,210,192,86,130,59,127,242,135,120,292,82,114,63,52,226,116,133,318,115,107,51,33,239,305,295,89]
#[0, 16, 27, 28, 44, 61, 62, 63, 68, 69, 72, 74, 103, 116, 124, 136, 152, 161, 174, 175], with: value_counts[value_counts == 1].index.tolist()

In [17]:
dcts=dict(zip(all_trips, all_preds))
for idx in range(len(random_indices)):
    random_draw = []
    for i in random_indices[idx]:
        random_draw.append(trips_with_uniq_taxi_ids[i])
        
        j = 0
        for i in random_draw: 
            if j % 2 == 1: 
                result = lda_complete_2(dftest[dftest.TRIP_ID == i]
                               .reset_index().iloc[0]['TAXI_ID'], 
                               is_cut=True, cutoff=15*150, test_size=0.1) #cutoff=15*100, test_size=0.7)
                dcts[i] = result[1][0]
            else:
                dcts[i] = 15*155
            j += 1
            print('')



lower_bound 2250
Training Accuracy: 0.3860759493670886
T46 2280
T228 4485


lower_bound 2250
Training Accuracy: 0.3860759493670886
T46 2280
T228 4485



lower_bound 2250
Training Accuracy: 0.3860759493670886
T46 2280
T228 4485


lower_bound 2250
Training Accuracy: 0.5921052631578947
T181 2400


lower_bound 2250
Training Accuracy: 0.3860759493670886
T46 2280
T228 4485


lower_bound 2250
Training Accuracy: 0.5921052631578947
T181 2400



lower_bound 2250
Training Accuracy: 0.3860759493670886
T46 2280
T228 4485


lower_bound 2250
Training Accuracy: 0.5921052631578947
T181 2400


lower_bound 2250
Training Accuracy: 0.18823529411764706
T53 2280
T242 2700
T307 2865


lower_bound 2250
Training Accuracy: 0.3860759493670886
T46 2280
T228 4485


lower_bound 2250
Training Accuracy: 0.5921052631578947
T181 2400


lower_bound 2250
Training Accuracy: 0.18823529411764706
T53 2280
T242 2700
T307 2865



lower_bound 2250
Training Accuracy: 0.3860759493670886
T46 2280
T228 4485


lower_bound 2250
Trai

ValueError: Internal work array size computation failed: -10

good cutoff: 
- cutoff=15*100, test_size=0.7
- better: - cutoff=15*150, test_size=0.1

## Percentile Model

In [22]:
perc_model = []
for i in list(dftest['TAXI_ID']):
    tmp=dftrain[dftrain.TAXI_ID==i]['TRAVEL_TIME']
    p=(np.percentile(tmp,25)+np.percentile(tmp,50)+np.percentile(tmp,75)+np.percentile(tmp,99.5))/9
    perc_model.append(p)

In [23]:
random_idx = [26,14,1,284,107,131,40,79,32,176,69,281,139,137,233,83,143,279,46,
              316,287,268,302,165,54,311,244,68,251,89,262,194,39,255,239,59,9,
              156,293,285,180,164,263,294,118,100,8,110,224,84,245,87,171,149,199,
              292,207,188,272,170,190,71,36,124,290,4,132,257,191,74,280,50,288,225,223,206,269,184,
              17,138,158,53,34,13,210,128,90,214,159,278,246,20,116,73,299,126,275,15,109,315,301,312,253,265,29,
              157,97,252,66,101,163,127,249,148,161,240,82,154,65,95,282,56,230,319,142,47,88,10,111,86]

## Final Result

In [25]:
for i in range(n):
    if i in random_idx:
        dcts[all_ids[i]] = perc_model[i]
    print(dcts[all_ids[i]])

390
514.9666666666659
510
465
484.375
2625
300
450
446.6666666666667
574.9250000000037
620.1333333333301
315
360
1169.6666666666706
607.0666666666663
626.3166666666666
480
750.6833333333346
615
3150
462.4916666666665
435
240
450
405
390
508.3333333333333
570
645
526.1666666666671
525
420
474.93333333333186
0
681.266666666667
2325
928.2833333333336
510
390
545.0
490.71666666666636
270
165
555
465
0
465.3749999999991
551.6666666666666
660
570
656.200000000002
90
420
577.0333333333323
525.4999999999987
270
442.31666666666723
330
2325
625.6499999999999
570
240
180
285
240
649.7499999999991
497.93333333333067
210
576.0999999999967
464.08333333333303
2325
585.0
270
761.9833333333341
930.1916666666663
225
420
300
315
665.6166666666664
420
300
545.0
1020.6666666666614
460.71666666666636
360
667.1416666666679
554.6333333333329
491.96666666666636
514.1416666666661
466.6666666666667
555
585
375
2325
1663.166666666663
360
681.266666666667
255
300
676.6666666666666
582.4583333333331
540
465
180
345

In [26]:
file1 = open("pred.csv", "w")
file1.write('"TRIP_ID","TRAVEL_TIME"\n')
for i in range(n):
    if i+1 != n:
        line = '"' + all_ids[i] + '",' + str(dcts[all_ids[i]]) + '\n'
    else:
        line = '"' + all_ids[i] + '",'  + str(dcts[all_ids[i]])
    file1.write(line)
file1.close()