Join the data from Part 1 with the data from Part 2 to create a new dataframe.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns                      
import matplotlib.pyplot as plt             
%matplotlib inline     
sns.set(color_codes=True)

In [None]:
# Load the three source datasets produced in the earlier parts of the project.
df_city_bikes = pd.read_csv('city_bikes_portland.csv')
df_foursquare = pd.read_csv('foursquare.csv')
df_yelp = pd.read_csv('yelp.csv')

# Yelp and Foursquare rows are matched on a composite key of
# 'bike_station_location', 'name' and 'address' in order to get a unique key to join on.
poi_key = ['bike_station_location', 'name', 'address']
df_first_join = df_yelp.merge(df_foursquare, on=poi_key, how='inner')

# Attach the bike-station columns to every matched point of interest.
df_final_join = df_first_join.merge(df_city_bikes, on='bike_station_location', how='left')
df_final_join

In [None]:
# Cleaning approach for the merged dataframe:
#   - Remove extraneous columns, but retain 'main_category_y' because it can offer more
#     detailed category combinations, and retain 'latitude'/'longitude' for future analysis.
#   - Keep rows that contain NaN values, since the other data in the same row can still
#     influence the statistical model.
#   - Check for and resolve duplicate rows.
#   - Treat only distance values beyond the 1000m radius as outliers, because data outside
#     this radius is meant to be excluded from the statistical model.
# Start by counting the missing values in each column.
df_final_join.isna().sum()

In [None]:
# Count the fully duplicated rows in the merged dataframe.
# The result shows that there are 0 duplicated rows, so nothing was dropped.
df_final_join.duplicated().sum()

In [None]:
# Check the distance column for outliers. The only outliers worth removing are distances
# outside the 0-1000m search radius, since we only want data within 1000m.
# df_final_join is used here because df_final_cleaned is not created until a later cell;
# 'distance_away_x' is the same column in both frames.
distances = df_final_join['distance_away_x']
sns.boxplot(x=distances)

# A value is out of range if it is above 1000m OR below 0m. The two conditions are combined
# with | because no single value can satisfy both at once.
out_of_range = (distances > 1000) | (distances < 0)
print(np.where(out_of_range))
# An empty index array means that no distance value lies outside the 0-1000m range.

In [None]:
# Drop the right-hand (Foursquare) distance column created by the merge;
# 'distance_away_x' is the distance column that is kept.
df_final_cleaned = df_final_join.drop(columns=['distance_away_y'])
df_final_cleaned

In [None]:
# Export the cleaned, joined dataframe to a CSV file for future use (the index is not written).
df_final_cleaned.to_csv('joined_data.csv', index=False)

Provide a visualization that you used as part of your EDA process. Explain the initial pattern or relationship you discovered through this visualization.

In [None]:
# From this boxplot we can see that a few distances are considered outliers based on the
# distribution of our dataset, but none of them are over 1000m, so they were kept since
# the dataset is supposed to include everything up to 1000m.
sns.boxplot(x=df_final_cleaned['distance_away_x'])

In [None]:
# Scatter plot of distance away vs review count. The initial pattern discovered with this
# visualization is that places closer to a bike station tend to have higher review counts,
# which is expected.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df_final_cleaned['distance_away_x'], y=df_final_cleaned['review_count'])
ax.set(xlabel='distance_away', ylabel='review_count')
plt.show()

In [None]:
# Scatter plot of total possible bikes vs review count. The initial finding from this
# visualization is that there does not seem to be a pattern or relationship between the
# total possible bikes at a station and the number of reviews received by a business
# within a 1000m radius of that bike station.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=df_final_cleaned['num_of_bikes'], y=df_final_cleaned['review_count'])
ax.set(xlabel='num_of_bikes', ylabel='review_count')
plt.show()

# Database

Put all your results in an SQLite3 database (remember, SQLite stores its databases as files on your local machine - make sure to create your database in your project's data/ directory!)

In [None]:
import sqlite3
from sqlite3 import Error

def create_connection(path):
    """Open a SQLite connection to the database at ``path``.

    Prints a confirmation message on success. If ``sqlite3`` raises an error
    while connecting, the error is printed and ``None`` is returned instead
    of a connection object.
    """
    try:
        connection = sqlite3.connect(path)
    except Error as e:
        print(f"The error '{e}' occurred")
        return None

    print("Connection to SQLite DB successful")
    return connection

In [None]:
# Open the project's SQLite database file through the helper;
# `connection` is None if the helper reported an error while connecting.
connection = create_connection("../data/city_bike_POI.sqlite")

In [None]:
# DDL for the bike-station table (columns: bike_station_location, latitude, longitude,
# num_of_bikes). IF NOT EXISTS makes the statement safe to run more than once.
create_city_bikes_portland_table = """
CREATE TABLE IF NOT EXISTS city_bikes_portland (
  bike_station_location TEXT,
  latitude FLOAT,
  longitude FLOAT,
  num_of_bikes INTEGER
);
"""

In [None]:
# Reuse the connection already opened by create_connection() rather than opening a second
# handle to the same database file; the first connection was otherwise never used or closed.
if connection is None:
    # create_connection() returns None when it could not connect; fail with a clear message.
    raise RuntimeError("No open SQLite connection: create_connection() did not succeed")
conn = connection
c = conn.cursor()

In [None]:
# Create the city_bikes_portland table; nothing happens if it already exists (IF NOT EXISTS).
c.execute(create_city_bikes_portland_table)

In [None]:
# Write the bike-station dataframe into the 'city_bikes_portland' table, without the index.
# NOTE(review): if_exists='replace' makes pandas drop and recreate the table, so the column
# types declared in the CREATE TABLE statement are not what ends up in the database.
# TODO confirm whether appending into the declared schema was intended.
df_city_bikes.to_sql('city_bikes_portland', conn, if_exists='replace', index=False)

In [None]:
# DDL for the Foursquare table (columns: bike_station_location, name, main_category,
# address, distance_away). IF NOT EXISTS makes the statement safe to run more than once.
create_foursquare_table = """
CREATE TABLE IF NOT EXISTS foursquare (
  bike_station_location TEXT,
  name TEXT,
  main_category TEXT,
  address TEXT,
  distance_away INTEGER
);
"""

In [None]:
# Create the foursquare table; nothing happens if it already exists (IF NOT EXISTS).
c.execute(create_foursquare_table)

In [None]:
# Write the Foursquare dataframe into the 'foursquare' table, without the index.
# NOTE(review): if_exists='replace' makes pandas drop and recreate the table, so the column
# types declared in the CREATE TABLE statement are not what ends up in the database.
# TODO confirm whether appending into the declared schema was intended.
df_foursquare.to_sql('foursquare', conn, if_exists='replace', index=False)

In [None]:
# DDL for the Yelp table (columns: bike_station_location, name, main_category, address,
# price, rating, review_count, status, distance_away). IF NOT EXISTS makes the statement
# safe to run more than once.
create_yelp_table = """
CREATE TABLE IF NOT EXISTS yelp (
  bike_station_location TEXT,
  name TEXT,
  main_category TEXT,
  address TEXT,
  price TEXT,
  rating FLOAT,
  review_count INTEGER,
  status TEXT,
  distance_away FLOAT
);
"""

In [None]:
# Create the yelp table; nothing happens if it already exists (IF NOT EXISTS).
c.execute(create_yelp_table)

In [None]:
# Write the Yelp dataframe into the 'yelp' table, without the index.
# NOTE(review): if_exists='replace' makes pandas drop and recreate the table, so the column
# types declared in the CREATE TABLE statement are not what ends up in the database.
# TODO confirm whether appending into the declared schema was intended.
df_yelp.to_sql('yelp', conn, if_exists='replace', index=False)

In [None]:
# DDL for the table holding the joined data. The '_x' / '_y' column names are the suffixes
# pandas adds to overlapping columns in a merge. IF NOT EXISTS makes the statement safe to
# run more than once.
create_joined_table = """
CREATE TABLE IF NOT EXISTS joined (
  bike_station_location TEXT,
  name TEXT,
  main_category_x TEXT,
  address TEXT,
  price TEXT,
  rating FLOAT,
  review_count INTEGER,
  status TEXT,
  distance_away_x FLOAT,
  main_category_y TEXT,
  latitude FLOAT,
  longitude FLOAT,
  num_of_bikes INTEGER
);
"""

In [None]:
# Create the joined table; nothing happens if it already exists (IF NOT EXISTS).
c.execute(create_joined_table)

In [None]:
# Write the cleaned, joined dataframe into the 'joined' table, without the index.
# NOTE(review): if_exists='replace' makes pandas drop and recreate the table, so the column
# types declared in the CREATE TABLE statement are not what ends up in the database.
# TODO confirm whether appending into the declared schema was intended.
df_final_cleaned.to_sql('joined', conn, if_exists='replace', index=False)

In [None]:
# Read every row back from the 'joined' table to check that the data was inserted correctly.
c.execute("""SELECT * FROM joined""").fetchall()

Look at the data before and after the join to validate your data.

In [None]:
# Check whether NaN values exist in the data before and after joining.
# Order of the printed flags: joined data, city bikes, Foursquare, Yelp.
has_nan = [
    frame.isnull().values.any()
    for frame in (df_final_cleaned, df_city_bikes, df_foursquare, df_yelp)
]
print(*has_nan)
# We can see that NaN values existed in 2 of the 3 dataframes that were joined, and the
# joined dataframe also has NaN values, which validates our join. Rows with NaN values were
# kept since other data in those rows can have an impact on the statistical model.

In [None]:
# Check whether there are any duplicate rows in the resulting joined data.
df_final_cleaned.duplicated().values.any()
# We can see that there are no duplicated rows in the final joined data, which validates
# the composite key that the tables were joined on.

In [None]:
# Check whether the number of rows after joining is less than the number of rows in the
# two tables that were joined on the composite key.
# Order of the printed shapes: Foursquare, Yelp, joined data.
print(*(frame.shape for frame in (df_foursquare, df_yelp, df_final_cleaned)))
# We can see that the joined dataframe has far fewer rows, which makes sense and validates
# that our composite key was in fact a good unique key.