In [1]:
# Import database dependencies
from sqlalchemy import inspect, create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
import config as creds

# Import Pandas and matplotlib dependencies
import pandas as pd
import numpy as np
import datetime as dt
from pathlib import Path
import matplotlib.pyplot as plt

# Import scikit packages
import sklearn.preprocessing as preprocessing
from sklearn.linear_model import LinearRegression
import sklearn.datasets as datasets
# For splitting of data into train and test set
from sklearn.model_selection import train_test_split
# Metrics for Evaluation of model Accuracy and F1-score
from sklearn.metrics  import f1_score,accuracy_score
import sklearn.metrics as metrics


In [2]:
#!pip install psycopg2

In [3]:
#!pip install psycopg2-binary 

In [4]:
# Create engine
engine = create_engine(f'postgresql://{creds.PGUSER}:{creds.PGPASSWORD}@{creds.PGHOST}:5432/{creds.PGDATABASE}')

In [5]:
# Create our session (link) from Python to the DB
session = Session(bind=engine.connect())

In [6]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
Base.prepare(engine, reflect=True)

In [7]:
# List tables in database
inspect(engine).get_table_names()

['ave_wage_indexing',
 'welfare_education',
 'cpi_inflation_rate',
 'crime_rate',
 'economic_features_full',
 'economic_features',
 'divorce_rate',
 'homeownership_rate',
 'min_wage_effective',
 'poverty_rates',
 'unemployment_rate']

In [8]:
# List columns in a specific table ('min_wage')
[column['name'] for column in inspect(engine).get_columns('economic_features')]

['year',
 'state',
 'population_million',
 'education_million',
 'welfare_million',
 'crime_rate',
 'unemployment_rate',
 'divorce_rate_per_1000_people',
 'homeownership_rate',
 'minimum_wage_effective',
 'cpi_average',
 'avg_wage_index',
 'poverty_rate']

In [9]:
# Testing
# List columns in a specific table ('min_wage')
[column['name'] for column in inspect(engine).get_columns('economic_features_full')]

['year',
 'state',
 'population_million',
 'education_million',
 'welfare_million',
 'crime_rate',
 'unemployment_rate',
 'divorce_rate_per_1000_people',
 'homeownership_rate',
 'minimum_wage_effective',
 'cpi_average',
 'avg_wage_index',
 'poverty_rate']

In [10]:
# Testing
# List columns in a specific table ('min_wage')
[column['name'] for column in inspect(engine).get_columns('ave_wage_indexing')]

['Year', 'awi', 'annual_change']

In [11]:
# Testing
# Getting a list of tables

from sqlalchemy.sql import and_
s = select([ave_wage_indexing]).where(and_(year = '2000')
for row in conn.execute(s)
  print row

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(row)? (<ipython-input-11-6cb4574e2e58>, line 7)

In [None]:
# Defining function that takes in a table name and outputs a dataframe
def db_reader(tablename):
    table_df = pd.read_sql_table(f'{tablename}', engine)
    return table_df

In [None]:
# Testing
# Reading a database table into a dataframe
unemployment_rate = db_reader('unemployment_rate')
unemployment_rate.head()

In [None]:
# Testing
# Reading a database table into a dataframe
poverty_rates = db_reader('poverty_rates')
poverty_rates.head()

In [None]:
# Reading a database table into a dataframe
econ_features_full = db_reader('economic_features_full')
econ_features_full.head()

In [None]:
# plt.scatter(econ_features_full.avg_wage_index,econ_features_full.poverty_rate )
# plt.xlabel('avg_wage_index')
# plt.ylabel('poverty_rate')
# plt.show()

# xlabel = independent variable
# ylabel = target variable that we want to predict 

In [None]:
# Drop the non-beneficial columns and drop any rows with null or "not a number" (NaN) values
model_df = econ_features_full.drop(columns=["state"], axis=1).dropna()
model_df.head()

In [None]:
# Define X and y variables
X = model_df.drop(["poverty_rate"], axis=1)#.values.reshape(-1, 1)
y = model_df["poverty_rate"]

In [None]:
X.head()

In [None]:
X.shape

In [None]:
model_df.dtypes

In [None]:
# Standardize data
import sklearn.preprocessing as preprocessing

std = preprocessing.StandardScaler()
# X is a matrix
std.fit(X)
X_std = std.transform(X)

In [None]:
# Split the dataset into training and testing sets 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)

In [None]:
# Create a model with scikit-learn
model = LinearRegression()

In [None]:
# Fit the data into the model
# By convention, X is capitalized and y is lowercase
model.fit(X, y)

In [None]:
# We use the predict() on the model to predict the output
# The model creates predicted y values based on X values
y_pred = model.predict(X)
print(y_pred.shape)
print(X.shape)


# for regression we use R2 score and MAE(mean absolute error)
# from sklearn.metrics import mean_absolute_error
# from sklearn.metrics import r2_score
# print(mean_absolute_error(y_test,y_pred))
# print(mean_absolute_error(y_test,y_pred))

In [None]:
# Plot the results. The best fit line is red.
plt.scatter(X, y)
plt.plot(X, y_pred, color='red')
plt.show()

In [None]:
# The slope
# The y-intercept
print(model.coef_)
print(model.intercept_)

In [None]:
# Import linear regression from the SciPy stats module.
from scipy.stats import linregress

# Perform linear regression.
(slope, intercept, r_value, p_value, std_err) = linregress(lats, temps)
# Get the equation of the line.
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
print(line_eq)
print(f"The p-value is: {p_value:.3f}")
# linregress function used to calculate the slope, y-intercept, correlation coefficient (r-value), p-value, and 
# standard deviation, and then we'll print out the equation for the line.