# Analysis of SF311 data regarding human and animal waste

## This notebook investigates which year San Francisco experienced the highest concentration of waste cases in the most recent decade (2009-2019)

In [1]:
import pandas as pd 
import geopandas as gpd
from matplotlib import pyplot as plt

ModuleNotFoundError: No module named 'geopandas'

In [None]:
import requests

In [None]:
import csv

In [None]:
import os
# Locate the processed SF311 waste extract under the directory named by the
# DATA_DIR environment variable (a KeyError is raised if it is not set).
data_dir = os.environ['DATA_DIR']
waste_fp = os.path.join(data_dir, 'processed', 'human_animal_waste_311.csv')
# Fail early if the file is missing. The data itself is loaded with
# pd.read_csv in a later cell, so the whole file is not read into memory here.
if not os.path.isfile(waste_fp):
    raise FileNotFoundError(waste_fp)

## Reads the csv file

In [None]:
# Load the processed waste-case extract into a DataFrame.
df = pd.read_csv(filepath_or_buffer=waste_fp)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

## Checks each column of the DataFrame and reports whether it contains any missing values

In [None]:
# For each column, report whether it contains at least one missing value.
df.isna().any()

## Extracts the year from the `Opened` date column into a new `year` column

In [None]:
# Parse the 'Opened' column as datetimes, then keep only the calendar year.
opened_index = pd.DatetimeIndex(df['Opened'])
df['year'] = opened_index.year

In [None]:
# Preview the first five rows, now including the derived 'year' column.
df.head(n=5)

## Counts the number of cases recorded in each year

In [None]:
# Group the cases by year and count the non-null entries in every column.
cases_grouped_by_year = df.groupby(by='year')
waste_by_years = cases_grouped_by_year.count()

In [None]:
# Show the yearly counts in chronological order with 'year' as a regular column.
waste_by_years.sort_values(by='year').reset_index()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Chronologically ordered yearly counts, with 'year' moved from the index
# into an ordinary column.
sorted_counts = waste_by_years.sort_values(by='year')
by_years = sorted_counts.reset_index()

In [None]:
# Preview the first five years of counts.
by_years.head(n=5)

## Plots the graph 

In [None]:
# Scatter plot of the per-year count of the 'Opened' column for every year in the data.
by_years.plot.scatter(x='year', y='Opened')

In [None]:
# Show the first thirteen rows of the yearly counts.
by_years.head(n=13)

## Lists the 11 years (2009-2019, inclusive) that we are looking at

In [None]:
# Select the rows for 2009-2019 by year value rather than by row position,
# so the result does not depend on which years happen to be in the file.
by_years[by_years['year'].between(2009, 2019)]

In [None]:
# Keep only 2009-2019 (inclusive), selected by year value instead of row
# position. The explicit copy makes `decade` independent of `by_years`, so
# later column renames do not raise SettingWithCopyWarning.
decade = by_years[by_years['year'].between(2009, 2019)].copy()

In [None]:
import pandas as pd

## This renames the column to "Cases" since it was previously labelled as "Opened".

In [None]:
# Rename 'Opened' (the per-year count) to 'Cases'. Reassigning instead of
# using inplace=True avoids mutating a slice of `by_years` and keeps the
# cell safe to re-run.
decade = decade.rename(columns={'Opened': 'Cases'})

In [None]:
# Inspect the selected years after renaming the count column.
decade.head(n=12)

## This renames the column name from "year" to "Year".

In [None]:
# Rename 'year' to 'Year'. Reassigning instead of using inplace=True avoids
# mutating a slice of `by_years` and keeps the cell safe to re-run.
decade = decade.rename(columns={'year': 'Year'})

In [None]:
# Inspect the selected years after renaming the year column.
decade.head(n=12)

## Creates a scatter plot of the number of cases in each year

In [None]:
# Scatter plot of yearly case counts for the selected years.
decade.plot.scatter(x='Year', y='Cases')

In [None]:
pip install scikit-learn 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression

## This fits a linear regression of cases against year and plots the fitted line.

In [None]:
# Fit an ordinary least-squares line of yearly case counts against year.
# Columns are selected by name rather than position: after reset_index,
# positional column 1 is whichever column comes first in the CSV, which is
# not guaranteed to be the 'Cases' column used by the scatter plot.
# reshape(-1, 1) produces the (n_samples, 1) 2-D arrays passed to fit/predict.
Year = decade['Year'].values.reshape(-1, 1)
Cases = decade['Cases'].values.reshape(-1, 1)
linear_regressor = LinearRegression()
linear_regressor.fit(Year, Cases)
# Fitted values at the observed years, used to draw the regression line.
Cases_pred = linear_regressor.predict(Year)

In [None]:
# Overlay the fitted regression line (red) on the observed yearly counts.
fig, ax = plt.subplots()
ax.scatter(Year, Cases)
ax.plot(Year, Cases_pred, color='red')
plt.show()

## This adds axis labels and a title to the graph.

In [None]:
# Same scatter-plus-regression figure, now with axis labels and a title so
# it can be read on its own.
fig, ax = plt.subplots()
ax.scatter(Year, Cases)
ax.plot(Year, Cases_pred, color='red')
ax.set_xlabel('Year')
ax.set_ylabel('Cases of Human or Animal Waste')
ax.set_title('Cases of Human or Animal Waste between 2009-2019')
plt.show()