In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# '/content/drive/...' paths read by the following cells resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Third-party packages used by the cells below.
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager

# Look up the family name of the font file stored on the shared drive and
# make it matplotlib's default font family.
font_props = font_manager.FontProperties(
    fname="/content/drive/Shared drives/gh_new_zone/--"
)
font_name = font_props.get_name()
plt.rc('font', family=font_name)

In [None]:
# Read the 'weekly profit' CSV from the shared drive into a DataFrame.
profit_csv_path = '/content/drive/Shared drives/gh_new_zone/---.csv'
profit1 = pd.read_csv(profit_csv_path)

In [None]:
# Data preprocessing ('profit'): keep only the transfer-zone rows, then
# average the numeric columns per region / zone / dist_type.

# dist_type values that mark a row as belonging to a transfer zone.
transfer_types = ['TRANSFER_STATION', 'TRANSFER_SUBWAY', 'TRANSFER_TERMINAL', 'TRANSFER_BUS']

# The weekly-profit data is loaded as `profit1`; no `profit` variable is
# defined by the cells above, so filtering on `profit` raised NameError on a
# fresh kernel (Restart & Run All).
# NOTE(review): if `profit` was meant to be a different frame, restore it here.
transfer = profit1[profit1.dist_type.isin(transfer_types)]

# Group by region & zone. numeric_only=True keeps the historical behaviour of
# averaging only numeric columns; pandas >= 2.0 raises TypeError when a
# non-numeric column reaches mean() without it.
profit_g = (
    transfer
    .groupby(['region1', 'region2', 'region3', 'zone_id', 'dist_type'])
    .mean(numeric_only=True)
    .reset_index()
)
profit_g.head()

In [None]:
# Read the 'reservation info' CSV from the shared drive and preview its
# first rows.
reserve_csv_path = '/content/drive/Shared drives/gh_new_zone/--.csv'
reserve = pd.read_csv(reserve_csv_path)
reserve.head()

In [None]:
# Merge the datasets ('profit' & 'reserve').
# No `on=` is passed, so pandas performs an inner join on every column name
# that the two frames have in common.
total2 = pd.merge(profit_g, reserve)

# Save the merged table as a csv file in the current working directory.
total2.to_csv('total2.csv', index=False)

# Preview goes last: Jupyter renders only the final expression of a cell, so
# a head() placed before to_csv() was never displayed.
total2.head()

In [None]:
# Split 'total2' by dist_type: rows whose dist_type is one of the four
# TRANSFER_* values go to total_transfer, every other row to not_transfer.
transfer_types = ['TRANSFER_STATION', 'TRANSFER_SUBWAY', 'TRANSFER_TERMINAL', 'TRANSFER_BUS']
is_transfer = total2.dist_type.isin(transfer_types)
total_transfer = total2[is_transfer]
not_transfer = total2[~is_transfer]

In [None]:
# Average utime, distance and rev_total in the transfer zones, computed as
# the column sum divided by the number of rows in the frame.
transfer_rows = len(total_transfer)
for metric in ('utime', 'distance', 'rev_total'):
    print(total_transfer[metric].sum() / transfer_rows)

In [None]:
# Average utime, distance and rev_total in the non-transfer zones, computed
# as the column sum divided by the number of rows in the frame.
other_rows = len(not_transfer)
for metric in ('utime', 'distance', 'rev_total'):
    print(not_transfer[metric].sum() / other_rows)

# transfer zone analysis - hypothesis testing

In [None]:
# Hypothesis testing:
# t-test of age, utime, distance and rev_total between transfer zones and
# non-transfer zones.
from scipy import stats


def compare_zone_groups(transfer_values, other_values):
    """Run Levene's test and Welch's t-test on two samples and print both.

    Parameters
    ----------
    transfer_values, other_values : pandas.Series
        The two samples to compare. Missing values are dropped first,
        because with SciPy's default NaN handling a single NaN turns both
        test results into NaN.

    Returns
    -------
    tuple
        ``(levene_result, ttest_result)`` as returned by ``stats.levene``
        and ``stats.ttest_ind``.
    """
    transfer_values = transfer_values.dropna()
    other_values = other_values.dropna()

    # Levene's test for equality of variances.
    levene_result = stats.levene(transfer_values, other_values)
    print('LeveneResult(F) : %.3f \np-value : %.3f' % (levene_result))

    # Independent two-sample t-test that does not assume equal variances
    # (equal_var=False), so it is used whatever Levene's test reports.
    ttest_result = stats.ttest_ind(transfer_values, other_values, equal_var=False)
    print('t statistic : %.3f \np-value : %.3f' % (ttest_result))
    return levene_result, ttest_result


# One pass per variable replaces four copy-pasted stanzas. `lresult` and
# `result` are still assigned at cell level, and end up holding the rev_total
# results exactly as before.
for column in ('age', 'utime', 'distance', 'rev_total'):
    print('[%s]' % column)  # label the numbers that follow
    lresult, result = compare_zone_groups(total_transfer[column], not_transfer[column])

In [None]:
# Correlation analysis in transfer zone.
# Only numeric/boolean columns are correlated: the frame also carries text
# columns (e.g. dist_type), and pandas >= 2.0 raises on those in corr()
# instead of silently skipping them as older releases did.
total_transfer.select_dtypes(include=['number', 'bool']).corr()

In [None]:
import seaborn as sns

# Annotated heatmap of the pairwise correlations in the transfer-zone frame.
# Text columns (e.g. dist_type) are excluded first because pandas >= 2.0
# raises on them in corr() instead of silently skipping them.
transfer_corr = total_transfer.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
sns.heatmap(data=transfer_corr, annot=True, fmt='.2f', linewidths=.5, cmap='Blues')

In [None]:
# Correlation analysis in non-transfer zones, restricted to numeric/boolean
# columns (pandas >= 2.0 raises in corr() when text columns such as dist_type
# are present).
not_transfer.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Scatter plots of age against rev_total, distance and utime for the
# transfer-zone rows, one figure per pair.
scatter_size = (15,10)
total_transfer.plot.scatter(x='age', y='rev_total', figsize=scatter_size)
total_transfer.plot.scatter(x='age', y='distance', figsize=scatter_size)
total_transfer.plot.scatter(x='age', y='utime', figsize=scatter_size)

# Car model analysis in transfer zone

In [None]:
# Proportion of each car model in transfer zones.
# transfer_g holds one row per (zone_id, car_model, region1-3) combination
# with the mean distance and utime of that group.
# The aggregated columns are selected with a list: the tuple-style
# ['distance','utime'] selection on a GroupBy was deprecated and then removed
# in pandas 2.0, where it raises instead of selecting both columns.
transfer_g = (
    total_transfer
    .groupby(['zone_id','car_model','region1','region2','region3'])[['distance','utime']]
    .mean()
    .reset_index()
)

# Number of transfer_g rows per car model, most frequent first, plus the
# same counts as a share of all transfer_g rows.
model_counts = transfer_g.car_model.value_counts().sort_values(ascending=False)
prop = pd.DataFrame(model_counts)
prop['prop'] = transfer_g.car_model.value_counts() / len(transfer_g)
print(prop)

In [None]:
# Chi-square test of independence between zone (zone_id) and car model.
import scipy.stats
from scipy.stats import chi2

# transfer_g is built from zone_id, car_model, region1-3, distance and utime;
# it has no 'car_id' column, so selecting it raised KeyError.
col = ['zone_id','car_model']
trans = transfer_g[col]

# Contingency table: number of transfer_g rows for every zone_id x car_model
# pair (0 where a pair does not occur).
# NOTE(review): the original counted 'car_id' per cell; if the table is meant
# to count individual cars, transfer_g must be built with car_id -- confirm.
contingency_table = pd.crosstab(trans['zone_id'], trans['car_model'])
Observed_Values = contingency_table.values  # observed counts
b = scipy.stats.chi2_contingency(contingency_table)
Expected_Values = b[3]  # expected counts under independence

# Degrees of freedom: (rows - 1) * (columns - 1) of the actual table. The
# previous slices (iloc[0:2, ...]) always produced 2 and 2, i.e. df = 1,
# which is only right for a 2x2 table.
no_of_rows, no_of_columns = contingency_table.shape
df = (no_of_rows-1)*(no_of_columns-1)
print("Degree of Freedom:",df)

# chi-square statistic: sum of (observed - expected)^2 / expected over ALL
# cells. `chi_square` holds the per-column contributions; the previous code
# added only the first two of them.
chi_square = ((Observed_Values - Expected_Values)**2. / Expected_Values).sum(axis=0)
chi_square_statistic = chi_square.sum()
print("chi-square statistic:-",chi_square_statistic)

# p-value: upper tail of the chi-square distribution. sf() is the survival
# function 1 - cdf, computed without the cancellation error of 1 - cdf(...).
p_value = chi2.sf(chi_square_statistic, df)
print('p-value:',p_value)

In [None]:
# utime, distance and rev_total by region: mean of each metric per
# (zone_id, region1, region2, region3), then a heatmap of the correlations
# between the three per-zone means.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (['distance','utime','rev_total'] without the inner brackets) was removed
# in pandas 2.0 and raises there.
transfer_g2 = (
    total_transfer
    .groupby(['zone_id','region1','region2','region3'])[['distance','utime','rev_total']]
    .mean()
)
plt.figure(figsize=(7,7))
sns.heatmap(data=transfer_g2.corr(), annot=True, fmt='.2f', linewidths=.5, cmap='Greens')

In [None]:
# Rank the zones by mean utime (highest first), keep the top 50, and show the
# five (region1, region2) pairs that contain the most of those zones.
rank = transfer_g2.sort_values(by='utime', ascending=False).reset_index()
rank2 = rank.head(50)
zones_per_region = rank2.groupby(['region1','region2'])['zone_id'].count()
zones_per_region.sort_values(ascending=False).head(5)