In [None]:
##Import necessary packages
import os
import numpy as np
import networkx as nx
import pandas as pd
from pandas.plotting import scatter_matrix, parallel_coordinates
import seaborn as sns
from sklearn import preprocessing
import matplotlib.pylab as plt
## Load the housing dataset and preview its first 9 observations
housing_df = pd.read_csv('../../.venv/lib/Datasets/HousingData.csv')
## Add a binary CAT_MEDV column: 1 where MEDV is above 30, otherwise 0
housing_df['CAT_MEDV'] = np.where(housing_df['MEDV'] > 30, 1, 0)
## head() defaults to 5 rows, so request the 9 rows explicitly
housing_df.head(9)

In [None]:
## Read the Amtrak dataset and build a ridership series indexed by date
amtrak_df = pd.read_csv('../../.venv/lib/Datasets/Amtrak.csv')
## Parse the Month column with the '%b-%y' format into a new Date column
amtrak_df['Date'] = pd.to_datetime(amtrak_df['Month'], format='%b-%y')
ridership_ts = pd.Series(
    data=amtrak_df['Ridership'].values,
    index=amtrak_df['Date'],
)

In [None]:
##PANDAS VERSION OF GRAPHING
##Line Graph of the ridership time series
ridership_ts.plot(ylim=[1300,2300], legend=False)
plt.xlabel('Year')
plt.ylabel('Ridership')
## Call plt.show(); a bare `plt.show` only references the function and
## leaves its repr as the cell output
plt.show()


In [None]:
##Scatter Plot of MEDV against LSTAT
housing_df.plot(kind='scatter', x='LSTAT', y='MEDV', legend=False)

In [None]:
##Bar chart of CHAS vs mean MEDV
## Select MEDV before plotting: plotting the full groupby mean would draw
## one bar per column, which contradicts the 'Avg. MEDV' axis label
ax = housing_df.groupby('CHAS')['MEDV'].mean().plot(kind='bar')
ax.set_ylabel('Avg. MEDV')

In [None]:
##Bar chart of CHAS vs CAT_MEDV
## Mean of the 0/1 CAT_MEDV flag per CHAS group, scaled to a percentage
datafp = housing_df.groupby('CHAS').mean()['CAT_MEDV'].mul(100)
ax = datafp.plot.bar(figsize=[5, 3])
ax.set_ylabel('% of CAT.MEDV')

In [None]:
##HISTOGRAM of MEDV
ax = housing_df['MEDV'].hist()
ax.set_xlabel('MEDV')
ax.set_ylabel('count')

In [None]:
##BOXPLOT of MEDV grouped by CHAS
ax = housing_df.boxplot(by='CHAS', column='MEDV')
ax.set_ylabel('MEDV')
## Clear both the axes title and the figure-level title
plt.title('')
plt.suptitle('')

In [None]:
##MATPLOTLIB Version of Code
##Line Graph of ridership against its date index
line_ax = plt.gca()  ## current axes, created on demand just as plt.plot would
line_ax.plot(ridership_ts.index, ridership_ts)
line_ax.set_xlabel('Year')
line_ax.set_ylabel('Ridership (in 000s)')

In [None]:
##Scatter Plot; coloured outlines ('C2') with unfilled circles
plt.scatter(
    housing_df['LSTAT'],
    housing_df['MEDV'],
    color='C2',
    facecolor='none',
)
plt.xlabel('LSTAT')
plt.ylabel('MEDV')

In [None]:
##Barchart of CHAS vs mean MEDV
dataforp = housing_df.groupby('CHAS').mean()['MEDV']
fig, ax = plt.subplots()
ax.bar(dataforp.index, dataforp, color=['C5', 'C1'])
## Ticks only at the two CHAS values
ax.set(xticks=[0, 1], xlabel='CHAS', ylabel='Avg. MEDV')

In [None]:
##Barchart of CHAS vs CAT.MEDV
## Mean of the 0/1 CAT_MEDV flag per CHAS group, expressed as a percentage
dataforp = housing_df.groupby('CHAS').mean()['CAT_MEDV'].mul(100)
fig, ax = plt.subplots()
ax.bar(dataforp.index, dataforp, color=['C5', 'C1'])
ax.set_xticks([0, 1])
ax.set_xlabel('CHAS')
ax.set_ylabel('% of CAT.MEDV')

In [None]:
##HISTOGRAM of MEDV with dashed grey gridlines
fig, ax = plt.subplots()
ax.hist(housing_df['MEDV'])
ax.grid(which='major', color='grey', linestyle='--')
ax.set_axisbelow(True)  ## draw the gridlines behind the bars
ax.set(xlabel='MEDV', ylabel='count')
plt.show()

In [None]:
##BOXPLOT of MEDV, one box per CHAS value
## One list of MEDV values for CHAS == 0 and one for CHAS == 1
dataforp = [
    list(housing_df[housing_df.CHAS == chas_value].MEDV)
    for chas_value in (0, 1)
]
fig, ax = plt.subplots()
ax.boxplot(dataforp)
## The boxes sit at positions 1 and 2; relabel them with the CHAS values
ax.set_xticks([1, 2])
ax.set_xticklabels([0, 1])
ax.set(xlabel='CHAS', ylabel='MEDV')
plt.show()

In [None]:
##Side-By-Side Boxplots: four predictors, each split by CAT_MEDV
fig, axes = plt.subplots(nrows=1, ncols=4)
for col_name, ax in zip(['NOX', 'LSTAT', 'PTRATIO', 'INDUS'], axes):
    housing_df.boxplot(column=col_name, by='CAT_MEDV', ax=ax)
    ax.set_xlabel('CAT_MEDV')
plt.suptitle('')  ##suppress overall title
plt.tight_layout()  ##increase separation between plots

In [None]:
##Heatmaps
##Simple heatmap of the correlation matrix, without cell values
corr = housing_df.corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
)

In [None]:
##Divergent Scale Colormap with the colour range fixed to [-1, 1]
sns.heatmap(
    corr,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    vmin=-1,
    vmax=1,
    cmap="RdBu",
)

In [None]:
##Heatmap on an 11x7 inch figure with one-decimal value labels
fig, ax = plt.subplots(figsize=(11, 7))
sns.heatmap(corr, annot=True, fmt=".1f", center=0, ax=ax)

In [None]:
##Heatmap of missing values
df = pd.read_csv('../../.venv/lib/Datasets/NYPD_COLLISIONS.csv', low_memory=False)
##Indicator frame: 1.0 where a value is missing, 0.0 where it is present
nainfo = pd.DataFrame(
    df.isna().to_numpy().astype(float),
    columns=df.columns,
)

In [None]:
##Plot missing value binary array on a 13x9 inch figure
fig, ax = plt.subplots(figsize=(13, 9))
ax = sns.heatmap(
    nainfo,
    vmin=0,
    vmax=1,
    cmap=["white", "#666666"],
    cbar=False,
    ax=ax,
)
ax.set_yticks([])  ## remove the y-axis ticks

In [None]:
##Draw Frame around figure output
## Light-grey, unfilled rectangle spanning the full extent of the heatmap data
rect = plt.Rectangle((0, 0), nainfo.shape[1], nainfo.shape[0], linewidth=1, edgecolor='lightgrey', facecolor='none')
rect = ax.add_patch(rect)
rect.set_clip_on(False)  ## keep the border from being clipped at the axes edge
## Rotate the labels on the heatmap axes itself: plt.xticks acts on the
## *current* figure, which in a separate cell is a new, empty one
ax.tick_params(axis='x', labelrotation=80)
## Re-display the heatmap figure so the frame and rotated labels are visible
ax.figure