### pandas study

학습 날짜 : 2019년 06월 10일, 어느 늦은 밤

학습 내용 : pandas iterrows()

학습 이유 : pandas가 느려서 좀 더 빠르게 쓰고 싶고, 까먹어서.

학습 참고
- [판다스 코드 속도 최적화를 위한 초보자 안내서](https://aldente0630.github.io/data-science/2018/08/05/a-beginners-guide-to-optimizing-pandas-code-for-speed.html)
- [pandas iterrows()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.iterrows.html)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# One-row frame with an 'int' column and a 'float' column, used below to inspect what iterrows() returns.
df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])

In [3]:
# iterrows() yields (index, Series) pairs; [1] picks the row Series out of the first pair.
# The displayed Series is dtype float64, so the value from the 'int' column comes back as 1.0.
row = next(df.iterrows())[1]
row

int      1.0
float    1.5
Name: 0, dtype: float64

In [4]:
# Load the 'iris' sample dataset through seaborn as a DataFrame.
iris = sns.load_dataset('iris')

In [5]:
# Preview the first rows: four numeric measurement columns plus 'species'.
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### for 문으로 돌아봅시다.

In [6]:
%%timeit


# Baseline: walk the frame by integer label and read a single cell through .loc.
# Works here because iris carries the default 0..n-1 index (see the preview above).
for row_label in range(len(iris)):
    iris.loc[row_label, 'sepal_length']  # result discarded on purpose; only the lookup is timed

768 µs ± 7.32 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### iterrows로 돌아봅시다.

In [29]:
%%timeit

# iterrows() yields (index label, row as a Series) pairs, in that order.
# Name the loop targets accordingly so the column lookup reads naturally.
for index, row in iris.iterrows():
    row['sepal_length']  # result discarded on purpose; only the access is timed

7.39 ms ± 136 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


----------------------

# 판다스 코드 속도 최적화를 위한 초보자 안내서

In [2]:
import numpy as np
import pandas as pd
from math import *

In [4]:
# Read the hotel listings from a CSV expected in the current working directory, then preview the first rows.
df = pd.read_csv('new_york_hotels.csv')
df.head()

Unnamed: 0,ean_hotel_id,name,address1,city,state_province,postal_code,latitude,longitude,star_rating,high_rate,low_rate
0,269955,Hilton Garden Inn Albany/SUNY Area,1389 Washington Ave,Albany,NY,12206,42.68751,-73.81643,3.0,154.0272,124.0216
1,113431,Courtyard by Marriott Albany Thruway,1455 Washington Avenue,Albany,NY,12206,42.68971,-73.82021,3.0,179.01,134.0
2,108151,Radisson Hotel Albany,205 Wolf Rd,Albany,NY,12205,42.7241,-73.79822,3.0,134.17,84.16
3,254756,Hilton Garden Inn Albany Medical Center,62 New Scotland Ave,Albany,NY,12208,42.65157,-73.77638,3.0,308.2807,228.4597
4,198232,CrestHill Suites SUNY University Albany,1415 Washington Avenue,Albany,NY,12206,42.68873,-73.81854,3.0,169.39,89.39


In [5]:
def normalize(df, pd_series):
    """Clip a series to mean +/- 2 standard deviations and return its natural log.

    Side effect: the clipped values are stored in ``df["cutoff_rate"]``
    (the column is created or overwritten).

    Args:
        df: DataFrame that receives the ``"cutoff_rate"`` column. It is
            expected to share its index with ``pd_series``.
        pd_series: Series of values to normalize; cast to float first.

    Returns:
        Series holding ``np.log`` of the clipped values.
    """
    pd_series = pd_series.astype(float)

    avg = np.mean(pd_series)
    std = np.std(pd_series)

    lower_bound = avg - 2 * std
    upper_bound = avg + 2 * std

    # Clip the whole series so in-range rows keep their own value.
    # Writing only the out-of-range rows left every other row as NaN
    # whenever "cutoff_rate" did not exist yet.
    df["cutoff_rate"] = pd_series.clip(lower=lower_bound, upper=upper_bound)

    normalized_price = np.log(df["cutoff_rate"].astype(float))

    return normalized_price

In [6]:
%timeit df['high_rate_normalized'] = normalize(df, df['high_rate'])

3.91 ms ± 295 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### haversine definition

In [11]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance between two points, in miles.

    All four arguments are latitudes/longitudes in degrees. Each may be a
    scalar or anything NumPy ufuncs accept element-wise (the notebook also
    passes pandas Series and NumPy arrays), in which case the result has
    the broadcast shape.
    """
    radius_mi = 3959  # multiplier that turns the central angle into miles

    # Degrees -> radians for every coordinate.
    phi1 = np.deg2rad(lat1)
    lam1 = np.deg2rad(lon1)
    phi2 = np.deg2rad(lat2)
    lam2 = np.deg2rad(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine term, then the central angle between the two points.
    hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    return radius_mi * central_angle

### iterrows haversine

In [15]:
%%timeit

# Row-by-row baseline: compute the distance from the fixed point (40.671, -73.985)
# to every row's coordinates, one iterrows() row at a time.
haversine_series = [
    haversine(40.671, -73.985, row['latitude'], row['longitude'])
    for _, row in df.iterrows()
]

df['distance'] = haversine_series

151 ms ± 3.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### apply Haversine on rows

In [16]:
%timeit df['distance'] =\
df.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

47.4 ms ± 199 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Vectorized implementation of Haversine applied on Pandas series

### timing vectorized implementation

In [18]:
# Vectorized implementation of Haversine applied on Pandas series
%timeit df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

1.91 ms ± 8.55 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [19]:
%%timeit

# Grab the underlying NumPy ndarrays of the two coordinate columns
np_lat, np_lon = df['latitude'].values, df['longitude'].values

3.33 µs ± 13.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


### 넘파이 배열을 사용한 벡터화

In [23]:
%%timeit

# Same vectorized call as before, but fed the columns' NumPy arrays instead of the Series themselves.
lat_values = df['latitude'].values
lon_values = df['longitude'].values
df['distance'] = haversine(40.671, -73.985, lat_values, lon_values)

211 µs ± 2.44 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Cythonize that loop

In [20]:
%load_ext cython

In [21]:
%%cython -a

# Haversine cythonized (no other edits)
import numpy as np
# Cython-compiled copy of haversine(): same four degree-valued arguments and the
# same NumPy formula. No C types are declared, so every value stays a Python object.
cpdef haversine_cy(lat1, lon1, lat2, lon2):
    # Multiplier applied to the central angle; the result `mi` is in miles.
    miles_constant = 3959
    # Convert all four coordinates from degrees to radians.
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    # Haversine term, then the central angle between the two points.
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    mi = miles_constant * c
    return mi

In [22]:
%timeit df['distance'] = df.apply(lambda row: haversine_cy(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

47.2 ms ± 158 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
