In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/MSc Proj/data/nyc_trip")
!ls

trip_data_10.csv	   trip_data_1_fixed.pkl  trip_data_7.csv
trip_data_11.csv	   trip_data_2.csv	  trip_data_8.csv
trip_data_12.csv	   trip_data_3.csv	  trip_data_9.csv
trip_data_1and2_fixed.pkl  trip_data_4.csv	  trip_data_all_fixed.pkl
trip_data_1and2.pkl	   trip_data_5.csv	  trip_data_all.pkl
trip_data_1.csv		   trip_data_6.csv


In [3]:
import numpy as np
def reduce_mem_usage(df, time_cols):
   
    start_mem = df.memory_usage().sum() / 1024**2
    start_mem_gb = df.memory_usage().sum() / 1024**3
    print('Memory usage of dataframe is {:.2f} MB/ {:.2f}GB'.format(start_mem,start_mem_gb))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            if col not in time_cols:
                df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    # return df

def deal_space_in_col(df):
    tmp = []
    for col in df.columns:
        tmp.append(col.strip())
    df.columns = tmp
    df.columns

In [6]:
#合并1 2 csv文件
df_1 = pd.read_csv(f"trip_data_1.csv")
df_2 = pd.read_csv(f"trip_data_2.csv")
df = pd.concat([df_1,df_2],axis=0, sort=False)

#删除空值行和0时长
df = df[~df.dropoff_longitude.isna()]
df = df[df.trip_time_in_secs!=0]

#计算v(Si)
df['speed'] = df['trip_distance']/df['trip_time_in_secs']
reduce_mem_usage(df,['pickup_datetime', 'dropoff_datetime'])
df['pickup_datetime'] = pd.to_datetime(df["pickup_datetime"])
df['dropoff_datetime'] = pd.to_datetime(df["dropoff_datetime"])
df.info()

Memory usage of dataframe is 3503.57 MB/ 3.42GB
Memory usage after optimization is: 1370.82 MB
Decreased by 60.9%
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28701248 entries, 0 to 13990175
Data columns (total 15 columns):
 #   Column              Dtype         
---  ------              -----         
 0   medallion           category      
 1   hack_license        category      
 2   vendor_id           category      
 3   rate_code           int16         
 4   store_and_fwd_flag  category      
 5   pickup_datetime     datetime64[ns]
 6   dropoff_datetime    datetime64[ns]
 7   passenger_count     int16         
 8   trip_time_in_secs   int16         
 9   trip_distance       float16       
 10  pickup_longitude    float16       
 11  pickup_latitude     float16       
 12  dropoff_longitude   float16       
 13  dropoff_latitude    float16       
 14  speed               float16       
dtypes: category(4), datetime64[ns](2), float16(6), int16(3)
memory usage: 1.3 GB


In [7]:
#删除一些异常点
df = df[['pickup_datetime', 'dropoff_datetime','trip_time_in_secs', 'trip_distance',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'speed']]
df = df[~df.speed.isna()]
df[df.isnull().T.any()]

Unnamed: 0,pickup_datetime,dropoff_datetime,trip_time_in_secs,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,speed


In [5]:
#进行分类相同的上车点和下车点
%%time
import collections
p_d_cluster = collections.defaultdict(int)
curr_cluster = 0
# str(pickup_longitude)+'-'+str(pickup_latitude)+'-'+str(dropoff_longitude)+'-'+str(dropoff_latitude)
def cut_lng_lat(pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude):
    global curr_cluster
    if p_d_cluster[str(pickup_longitude)+'-'+str(pickup_latitude)+'-'+str(dropoff_longitude)+'-'+str(dropoff_latitude)]==0:
        curr_cluster += 1
        p_d_cluster[str(pickup_longitude)+'-'+str(pickup_latitude)+'-'+str(dropoff_longitude)+'-'+str(dropoff_latitude)] = curr_cluster
    return p_d_cluster[str(pickup_longitude)+'-'+str(pickup_latitude)+'-'+str(dropoff_longitude)+'-'+str(dropoff_latitude)]

df['cluster'] = df.apply(lambda row: cut_lng_lat(row['pickup_longitude'],row['pickup_latitude'],row['dropoff_longitude'],row['dropoff_latitude']),axis=1)
df.to_pickle(f"trip_data_1and2_fixed.pkl")

CPU times: user 14min 18s, sys: 24.6 s, total: 14min 42s
Wall time: 14min 46s


In [4]:
# df = pd.read_pickle(f"trip_data_1and2_fixed.pkl")
df.reset_index(inplace=True)

In [5]:
print(u'当前进程的内存使用：%.4f GB' % (psutil.Process(os.getpid()).memory_info().rss / 1024 / 1024 / 1024) )

当前进程的内存使用：1.7091 GB


In [15]:
from sklearn.model_selection import train_test_split
set_random_seed(2020)
X_train, X_test, y_train, y_test = train_test_split(df[['trip_distance',
                                                    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
                                                    'dropoff_latitude',]],
                                                    df['trip_time_in_secs'], 
                                                    test_size =0.01, random_state=2020)

In [16]:
df['month'] = df.pickup_datetime.dt.month

In [17]:
set(df.month)

{1, 2}

### LR

In [30]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

s_scaler = StandardScaler()
# X_train_lr = s_scaler.fit_transform(X_train)
# X_test_lr = s_scaler.transform(X_test)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

In [31]:
mean_absolute_error(y_test,y_pred)

216.07986

### AVG

In [21]:
X_test.shape

(287013, 5)

In [22]:
df_train = df.loc[X_train.index,['trip_time_in_secs',"cluster"]]
df_test = df.loc[X_test.index,['trip_time_in_secs','cluster']]
clusters = set(df_test.cluster)

In [None]:
from tqdm import tqdm
for c in tqdm(clusters):
    df_test.loc[df_test.cluster==c,'pred'] = np.mean(df_train.loc[df_train.cluster==c].trip_time_in_secs.values)

In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
df_test = df_test[~df_test.pred.isna()]
mean_absolute_error(df_test.trip_time_in_secs.values,df_test.pred.values)

248.27167800399437

### TEMP rel

In [25]:
def get_week_hour(df):
    df['weekday'] = df.pickup_datetime.dt.weekday
    df['hour'] = df.pickup_datetime.dt.hour
get_week_hour(df)

In [26]:
from tqdm import tqdm
for week in tqdm(range(7)):
    for hour in range(24):
        df.loc[(df.weekday==week)&(df.hour==hour), "speed_timeslot"] = np.mean(df.loc[(df.weekday==week)&(df.hour==hour), "speed"].values)


  0%|          | 0/7 [00:00<?, ?it/s][A
 14%|█▍        | 1/7 [00:07<00:46,  7.67s/it][A
 29%|██▊       | 2/7 [00:14<00:37,  7.40s/it][A
 43%|████▎     | 3/7 [00:21<00:28,  7.21s/it][A
 57%|█████▋    | 4/7 [00:28<00:21,  7.09s/it][A
 71%|███████▏  | 5/7 [00:34<00:13,  7.00s/it][A
 86%|████████▌ | 6/7 [00:41<00:06,  6.93s/it][A
100%|██████████| 7/7 [00:48<00:00,  6.90s/it]


In [27]:
df_train = df.loc[X_train.index,["weekday","hour",'trip_time_in_secs',"cluster",'speed_timeslot']]
df_test = df.loc[X_test.index,["weekday","hour",'trip_time_in_secs','cluster','speed_timeslot']]
# df_train = X_train
# df_test =  X_test.reset_index(drop=True)
clusters = set(df_test.cluster)
df_train.head()

Unnamed: 0,weekday,hour,trip_time_in_secs,cluster,speed_timeslot
6724766,2,19,362,35,0.003609
11300807,1,6,355,1,0.005249
15700064,5,13,900,9,0.003929
13777549,4,9,334,62,0.003099
22207776,6,0,351,1,0.004059


In [None]:
for c in tqdm(clusters):
    Nq_index = df_test.loc[df_test.cluster==c,'speed_timeslot'].index
    tmp = df_train.loc[df_train.cluster==c].trip_time_in_secs.values * df_train.loc[df_train.cluster==c].speed_timeslot.values
    for i in Nq_index:
        # print(c,i)
        df_test.loc[i,'pred'] = np.mean(tmp/(df_test.loc[i,'speed_timeslot']))

In [29]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
df_test = df_test[~df_test.pred.isna()]
mean_absolute_error(df_test.trip_time_in_secs.values,df_test.pred.values)

240.89273213367403

### TEMP abs