In [3]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Parser options shared by the three city trip files: comma-separated values,
# header on the first row, UTF-8 text, and low_memory=False so pandas does not
# parse the file in chunks.
CSV_OPTIONS = dict(sep=',', low_memory=False, header=0, encoding='utf8')

NY_df = pd.read_csv('data/newyork_dataset', **CSV_OPTIONS)
CHI_df = pd.read_csv('data/chicago_dataset', **CSV_OPTIONS)
BOS_df = pd.read_csv('data/boston_dataset', **CSV_OPTIONS)

The aim here is to find correlations between the stations of each of the three datasets, first separately and then jointly, through a clustering technique based on Machine Learning.

In this case the chosen algorithm is K-Means clustering, as provided by the scikit-learn library.

### Importing necessary libraries

In [4]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

## Dataset preprocessing

The idea is to feed the algorithm a matrix in which each row is the vector `v` of one station: the index `i` corresponds to an hour of the day, and the value `v[i]` is the bike flow at that station during that hour.

The flow is positive when the number of incoming bikes is greater than the number of outgoing ones, and negative when more bikes leave than arrive.

### Input Matrix

In [83]:
def matrix_creation(df):
    """Build the station-by-hour bike-flow matrix for one city.

    For every (station, hour) pair the flow is the number of trips that end
    at the station minus the number of trips that start from it. Trips are
    counted through their non-null ``'tripduration'`` values.

    NOTE(review): both arrivals and departures are bucketed by the trip's
    ``'start hour'``, so an arrival is attributed to the hour the trip began,
    not the hour it ended. Confirm this is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records containing the columns ``'start station name'``,
        ``'end station name'``, ``'start hour'`` and ``'tripduration'``.

    Returns
    -------
    pandas.DataFrame
        One row per station, in order of first appearance in the merged
        counts, with a ``'station'`` column followed by the 24 columns
        ``0`` .. ``23`` holding the flow as floats. Hours with no recorded
        trips are 0.0. Any ``'start hour'`` value outside 0..23 is dropped.
    """
    trip_cols = ['start hour', 'tripduration']

    # Arrivals per (end station, hour) and departures per (start station, hour).
    in_df = (df[['end station name'] + trip_cols]
             .groupby(['end station name', 'start hour'])
             .count()
             .reset_index())
    out_df = (df[['start station name'] + trip_cols]
              .groupby(['start station name', 'start hour'])
              .count()
              .reset_index())

    # Outer merge keeps pairs that only ever appear as a start or as an end.
    # 'tripduration_x' comes from the left frame (departures) and
    # 'tripduration_y' from the right frame (arrivals).
    t_df = pd.merge(left=out_df, right=in_df, how='outer',
                    left_on=['start station name', 'start hour'],
                    right_on=['end station name', 'start hour'])
    out_flow = t_df['tripduration_x'].fillna(0)
    in_flow = t_df['tripduration_y'].fillna(0)
    t_df['flow'] = in_flow - out_flow

    # Rows present only on the departures side have no end station name;
    # fall back to the start station name so every row carries a station.
    t_df['station'] = t_df['end station name'].fillna(t_df['start station name'])

    # Keep stations in order of first appearance, and force all 24 hour
    # columns to exist even when some hour has no trips at all.
    stations = pd.Index(t_df['station'].unique(), name='station')
    hours = pd.Index(range(24), name='start hour')

    # A single pivot replaces the row-by-row fill of the matrix.
    matrix = (
        t_df.pivot(index='station', columns='start hour', values='flow')
        .reindex(index=stations, columns=hours)
        .fillna(0.0)
        .astype(float)
        .reset_index()
    )
    return matrix
    
# Build the station-by-hour flow matrix of each city, in the order
# New York, Chicago, Boston.
NY_matrix, CHI_matrix, BOS_matrix = (
    matrix_creation(trips) for trips in (NY_df, CHI_df, BOS_df)
)

# Render the three matrices one after another.
display(NY_matrix, CHI_matrix, BOS_matrix)

start hour,station,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,1 Ave & E 110 St,8.0,7.0,8.0,1.0,-7.0,-6.0,-12.0,-37.0,-77.0,...,-22.0,1.0,12.0,34.0,5.0,46.0,7.0,29.0,22.0,9.0
1,1 Ave & E 16 St,14.0,13.0,10.0,-2.0,-2.0,-23.0,-184.0,-232.0,-546.0,...,15.0,35.0,50.0,359.0,310.0,223.0,159.0,84.0,110.0,16.0
2,1 Ave & E 18 St,10.0,13.0,7.0,-11.0,-2.0,-49.0,-143.0,-316.0,-412.0,...,15.0,35.0,70.0,263.0,377.0,127.0,22.0,22.0,33.0,13.0
3,1 Ave & E 30 St,-1.0,0.0,-2.0,21.0,17.0,40.0,316.0,154.0,3.0,...,-58.0,-123.0,-161.0,-15.0,12.0,31.0,-43.0,8.0,0.0,5.0
4,1 Ave & E 44 St,-3.0,1.0,5.0,-3.0,3.0,25.0,0.0,184.0,612.0,...,-14.0,-31.0,-189.0,-546.0,-122.0,-48.0,-8.0,15.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
786,Morris Canal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
787,NYCBS DEPOT - DELANCEY,0.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,...,3.0,5.0,7.0,4.0,8.0,5.0,5.0,3.0,5.0,2.0
788,Newport Pkwy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
789,Paulus Hook,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


start hour,station,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,2112 W Peterson Ave,0.0,0.0,0.0,8.0,0.0,10.0,0.0,0.0,0.0,...,-2.0,1.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0
1,63rd St Beach,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,...,3.0,4.0,3.0,-1.0,-3.0,-11.0,-3.0,0.0,0.0,1.0
2,900 W Harrison St,-1.0,1.0,0.0,1.0,0.0,-4.0,13.0,33.0,48.0,...,-1.0,-36.0,-50.0,-27.0,-22.0,2.0,-28.0,-3.0,-1.0,1.0
3,Aberdeen St & Jackson Blvd,0.0,1.0,0.0,0.0,-1.0,-28.0,-16.0,-21.0,-43.0,...,-7.0,-17.0,69.0,36.0,9.0,22.0,8.0,1.0,0.0,2.0
4,Aberdeen St & Monroe St,2.0,0.0,-1.0,2.0,-1.0,-12.0,-47.0,-39.0,-34.0,...,13.0,28.0,60.0,39.0,14.0,4.0,-8.0,1.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568,Woodlawn Ave & 55th St,0.0,2.0,-1.0,0.0,0.0,1.0,-2.0,-5.0,17.0,...,4.0,-9.0,-9.0,24.0,3.0,1.0,-4.0,-12.0,-2.0,-1.0
569,Woodlawn Ave & Lake Park Ave,0.0,0.0,0.0,0.0,0.0,-2.0,-7.0,-5.0,-21.0,...,-3.0,-1.0,3.0,20.0,5.0,1.0,4.0,3.0,0.0,0.0
570,Yates Blvd & 75th St,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
571,Bennett Ave & 79th St,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


start hour,station,0,1,2,3,4,5,6,7,8,...,14,15,16,17,18,19,20,21,22,23
0,175 N Harvard St,-1.0,1.0,2.0,-3.0,0.0,1.0,10.0,-16.0,-38.0,...,3.0,1.0,33.0,15.0,19.0,-8.0,7.0,4.0,3.0,3.0
1,18 Dorrance Warehouse,0.0,2.0,3.0,2.0,2.0,0.0,0.0,3.0,22.0,...,4.0,4.0,-16.0,-1.0,6.0,3.0,1.0,1.0,1.0,1.0
2,191 Beacon St,-1.0,1.0,0.0,3.0,0.0,-3.0,-5.0,-27.0,-76.0,...,0.0,12.0,25.0,46.0,31.0,19.0,34.0,16.0,11.0,5.0
3,30 Dane St,-1.0,-6.0,1.0,0.0,-2.0,0.0,7.0,-19.0,-18.0,...,-6.0,11.0,16.0,31.0,37.0,12.0,-7.0,-10.0,-10.0,2.0
4,359 Broadway - Broadway at Fayette Street,9.0,1.0,1.0,2.0,1.0,-21.0,-19.0,-72.0,-172.0,...,-7.0,16.0,34.0,31.0,76.0,62.0,39.0,22.0,21.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263,Watermark Seaport - Boston Wharf Rd at Seaport...,-1.0,-3.0,-5.0,0.0,0.0,11.0,41.0,144.0,93.0,...,17.0,-29.0,-94.0,-120.0,-36.0,1.0,-4.0,-6.0,2.0,-1.0
264,Wentworth Institute of Technology - Huntington...,-6.0,-5.0,0.0,1.0,1.0,-1.0,-2.0,4.0,6.0,...,12.0,-22.0,-19.0,-22.0,5.0,6.0,-19.0,-5.0,-2.0,5.0
265,West End Park,-2.0,3.0,0.0,0.0,0.0,1.0,-1.0,-88.0,-324.0,...,7.0,19.0,161.0,232.0,30.0,27.0,25.0,-5.0,4.0,3.0
266,Williams St at Washington St,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-9.0,-13.0,...,4.0,-1.0,11.0,5.0,12.0,16.0,1.0,0.0,3.0,1.0


### Scaling

## Choosing number of clusters k

## K-Means algorithm