## 뉴욕시의 Taxi 데이터를 활용한 예측 모델
---
- 데이터셋 : 캐글의 NYC_taxi.csv (아래 코드에서는 `train.csv` 파일명으로 로딩)
- 거리에 따른 요금 예측 ==> 회귀
    * 조건 : 뉴욕시 내
- 승차에 따른 하차 지역 예측 ==> 분류
    * 조건 : 뉴욕시 내

### [1] 데이터 로딩

In [46]:
import pandas as pd

# nrows       : number of rows to load from the file
# parse_dates : columns to parse as datetime while reading
df = pd.read_csv(
    'train.csv',
    parse_dates=['pickup_datetime'],
    nrows=50000,
)

In [47]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1


In [48]:
# Print each column's dtype and non-null count, plus total memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   key                50000 non-null  object             
 1   fare_amount        50000 non-null  float64            
 2   pickup_datetime    50000 non-null  datetime64[ns, UTC]
 3   pickup_longitude   50000 non-null  float64            
 4   pickup_latitude    50000 non-null  float64            
 5   dropoff_longitude  50000 non-null  float64            
 6   dropoff_latitude   50000 non-null  float64            
 7   passenger_count    50000 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(1), object(1)
memory usage: 3.1+ MB


In [49]:
# Remove the 'key' column (object dtype) and rebind df to the result.
df = df.drop(columns='key')

In [50]:
# Re-check the schema now that the 'key' column has been dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   fare_amount        50000 non-null  float64            
 1   pickup_datetime    50000 non-null  datetime64[ns, UTC]
 2   pickup_longitude   50000 non-null  float64            
 3   pickup_latitude    50000 non-null  float64            
 4   dropoff_longitude  50000 non-null  float64            
 5   dropoff_latitude   50000 non-null  float64            
 6   passenger_count    50000 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(5), int64(1)
memory usage: 2.7 MB


In [51]:
# Show the column labels of the frame.
df.columns

Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'passenger_count'],
      dtype='object')

### [2] 결측치 및 이상치 체크

In [52]:
# Count missing values per column.
df.isna().sum()

fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [53]:
# Summary statistics for the numeric columns; the min/max rows are where
# out-of-range values show up (e.g. a negative fare, a latitude above 90).
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,11.364171,-72.509756,39.933759,-72.504616,39.926251,1.66784
std,9.685557,10.39386,6.224857,10.40757,6.014737,1.289195
min,-5.0,-75.423848,-74.006893,-84.654241,-74.006377,0.0
25%,6.0,-73.992062,40.73488,-73.991152,40.734372,1.0
50%,8.5,-73.98184,40.752678,-73.980082,40.753372,1.0
75%,12.5,-73.967148,40.76736,-73.963584,40.768167,2.0
max,200.0,40.783472,401.083332,40.851027,43.41519,6.0


### 특성 새롭게 생성 => 거리 Feature
---
- 공식 : $d = \sqrt{(x_2-x_1)^2+(y_2-y_1)^2}$ (위도·경도 좌표 차이에 대한 유클리드 거리이므로 단위는 km가 아닌 도(degree))

In [54]:
import numpy as np

# Straight-line (Euclidean) distance between pickup and drop-off points,
# computed directly on the latitude/longitude columns, so the result is in
# coordinate units rather than kilometres.
df['distance'] = np.sqrt(
    df['dropoff_latitude'].sub(df['pickup_latitude']).pow(2)
    + df['dropoff_longitude'].sub(df['pickup_longitude']).pow(2)
)

In [55]:
# Preview the first five rows, now including the new 'distance' column.
df.head(5)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance
0,4.5,2009-06-15 17:26:21+00:00,-73.844311,40.721319,-73.84161,40.712278,1,0.009436
1,16.9,2010-01-05 16:52:16+00:00,-74.016048,40.711303,-73.979268,40.782004,1,0.079696
2,5.7,2011-08-18 00:35:00+00:00,-73.982738,40.76127,-73.991242,40.750562,2,0.013674
3,7.7,2012-04-21 04:30:42+00:00,-73.98713,40.733143,-73.991567,40.758092,1,0.02534
4,5.3,2010-03-09 07:51:00+00:00,-73.968095,40.768008,-73.956655,40.783762,1,0.01947
