In [1]:
import os
from collections import Counter
from pathlib import Path
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sqlalchemy import create_engine

ModuleNotFoundError: No module named 'psycopg2'

In [None]:
# Connection URL for the project's Postgres database (the one browsed via pgAdmin).
# Credentials are NOT stored in the notebook: set SALMON_DB_USER and
# SALMON_DB_PASSWORD in the environment before starting the kernel.
# NOTE: a password was previously committed in this cell; treat it as
# compromised and rotate it.
DB_HOST = "salmonanalysis.cs4hgx1gpbbz.us-east-1.rds.amazonaws.com"
DB_PORT = 5432
DB_NAME = "postgres"


def build_db_string(user=None, password=None):
    """Return the SQLAlchemy URL for the salmon database.

    Parameters
    ----------
    user : str or None
        Database user name. When empty or None the URL carries no
        credentials at all.
    password : str or None
        Password for ``user``. Ignored when ``user`` is not given.

    Returns
    -------
    str
        A ``postgresql://`` URL. User and password are percent-encoded so
        characters such as ``$`` or ``@`` cannot corrupt the URL.
    """
    credentials = ""
    if user:
        credentials = quote_plus(user)
        if password:
            credentials += ":" + quote_plus(password)
        credentials += "@"
    return f"postgresql://{credentials}{DB_HOST}:{DB_PORT}/{DB_NAME}"


# If the variables are unset the URL has no credentials; the Postgres client
# library is then expected to fall back to its own defaults (PGUSER /
# PGPASSWORD / ~/.pgpass) -- verify against your driver setup.
db_string = build_db_string(
    os.environ.get("SALMON_DB_USER"),
    os.environ.get("SALMON_DB_PASSWORD"),
)

In [None]:
# Create the SQLAlchemy engine from the connection URL held in db_string.
# The same engine is reused later in the notebook for reading and writing tables.
engine = create_engine(db_string)

In [None]:
# Load the preprocessed salmon table into a DataFrame.
# The connection lives inside a context manager so it is closed as soon as
# the query has been read, instead of staying open for the kernel's lifetime.
with engine.connect() as connection:
    df = pd.read_sql("SELECT * FROM preprocessed_salmon_data", connection)

# Drop the 'index' column stored in the table (presumably written by an
# earlier to_sql call with its default index=True -- confirm).
# `columns=` replaces the positional axis argument of `df.drop('index', 1)`,
# which was deprecated and is rejected by pandas 2.0 and later.
df = df.drop(columns='index')

df.head()

In [None]:
# Pull in Data
# df = pd.read_csv('Data/salmon_preprocessed_ws.csv')
# df.head()

In [None]:
# Keep only the rows of the stream the model is trained on (Wenatchee River)
# and renumber them from zero.
is_wenatchee = df['Nwr_Population_Name'] == 'Chinook Salmon (Upper Columbia River Spring-run ESU) - Wenatchee River'
wenatchee_df = df.loc[is_wenatchee].reset_index(drop=True)
wenatchee_df.info()

In [None]:
# Remove the rows at positions 0, 1 and 3, then renumber the index from zero.
# NOTE(review): this cell was originally described as deleting the first
# three rows, which would be positions 0, 1 and 2; the code drops 0, 1 and 3.
# Confirm which rows are meant to be removed.
rows_to_drop = wenatchee_df.index[[0, 1, 3]]
wenatchee_df = wenatchee_df.drop(rows_to_drop).reset_index(drop=True)
wenatchee_df

In [None]:
# Scatter of wild spawners against brood year, drawn as red circles.
brood_years = wenatchee_df['Brood_Year']
wild_spawners = wenatchee_df['Wild_Spawners']
plt.plot(brood_years, wild_spawners, 'ro')
# Optional fixed axis limits, currently disabled:
# plt.axis([1948, 2020, 0, 1000])
plt.show()

In [None]:
# Feature matrix: brood year plus the previous-year, two-years-prior and
# three-years-prior wild spawner columns.
feature_columns = [
    'Brood_Year',
    'Wild_Spawners_Prev_Yr',
    'Wild_Spawners_Two_Yrs_Prior',
    'Wild_Spawners_Three_Yrs_Prior',
]
X = wenatchee_df[feature_columns]

# Target: the wild spawner count of the row's own brood year.
y = wenatchee_df['Wild_Spawners']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=112,
)

In [None]:
# Linear regression model: fit on the training split, then predict the
# held-out test rows.
l_regr = LinearRegression()
l_regr.fit(X_train, y_train)

# Predictions for the test features, compared against y_test in the next cell.
y_pred = l_regr.predict(X_test)

In [None]:
# Evaluate the fitted model on the held-out test split.
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

# One coefficient per feature column, in the order the features were passed to fit.
print("Coefficients: \n", l_regr.coef_)
# Mean squared error and its square root (RMSE, in the units of the target).
print("Mean squared error: %.2f" % mse)
print('Root Mean Squared error of is:', np.sqrt(mse))
# Coefficient of determination: 1 is a perfect prediction.
print("Coefficient of determination: %.2f" % r_squared)
# For a regressor, .score() reports R^2 (not a classification accuracy);
# it is shown as the cell's output.
l_regr.score(X_test, y_test)

In [None]:
# Show the last rows of the training frame; the next cells append forecast
# rows after them.
wenatchee_df.tail()

In [None]:
# Predict Wild Spawners for 2012
# Build the 2012 row and add it to the end of wenatchee_df.
# Values that are not observed yet are real missing values (np.nan) rather
# than the string 'NaN', so the numeric columns do not pick up string cells.
# NOTE(review): the three lag values are hard-coded; presumably they are the
# 2011, 2010 and 2009 wild spawner counts -- confirm against the data.
row_2012 = {
    'Brood_Year': 2012,
    'Nwr_Population_Name': 'Chinook Salmon (Upper Columbia River Spring-run ESU) - Wenatchee River',
    'Number_Of_Spawners': np.nan,
    'Fracwild': np.nan,
    'Wild_Spawners': np.nan,
    'Wild_Spawners_Prev_Yr': 1225.90,
    'Wild_Spawners_Two_Yrs_Prior': 968.22,
    'Wild_Spawners_Three_Yrs_Prior': 294.30,
}
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# gives the same result and works on older versions too.
wenatchee_df = pd.concat([wenatchee_df, pd.DataFrame([row_2012])], ignore_index=True)
wenatchee_df.tail()

In [None]:
# Run the fitted model over every row of wenatchee_df, including the 2012
# row appended above; the output array has one prediction per row.
prediction_features = [
    'Brood_Year',
    'Wild_Spawners_Prev_Yr',
    'Wild_Spawners_Two_Yrs_Prior',
    'Wild_Spawners_Three_Yrs_Prior',
]
X_predict = wenatchee_df[prediction_features]

predictions_2012 = l_regr.predict(X_predict)
predictions_2012

In [None]:
# Predict Wild Spawners for 2013
# Build the 2013 row and add it to the end of wenatchee_df.
# Values that are not observed yet are real missing values (np.nan) rather
# than the string 'NaN', so the numeric columns do not pick up string cells.
# NOTE(review): 'Wild_Spawners_Prev_Yr' (113.53) is hard-coded; presumably it
# is the model's 2012 prediction copied from an earlier output. If so it must
# be updated by hand whenever the model or the data changes -- confirm.
row_2013 = {
    'Brood_Year': 2013,
    'Nwr_Population_Name': 'Chinook Salmon (Upper Columbia River Spring-run ESU) - Wenatchee River',
    'Number_Of_Spawners': np.nan,
    'Fracwild': np.nan,
    'Wild_Spawners': np.nan,
    'Wild_Spawners_Prev_Yr': 113.53,
    'Wild_Spawners_Two_Yrs_Prior': 1225.90,
    'Wild_Spawners_Three_Yrs_Prior': 968.22,
}
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# gives the same result and works on older versions too.
wenatchee_df = pd.concat([wenatchee_df, pd.DataFrame([row_2013])], ignore_index=True)
wenatchee_df.tail()

In [None]:
# Run the fitted model over every row of wenatchee_df, including the 2013
# row appended above; the output array has one prediction per row.
prediction_features = [
    'Brood_Year',
    'Wild_Spawners_Prev_Yr',
    'Wild_Spawners_Two_Yrs_Prior',
    'Wild_Spawners_Three_Yrs_Prior',
]
X_predict = wenatchee_df[prediction_features]

predictions_2013 = l_regr.predict(X_predict)
predictions_2013

In [None]:
# Predict Wild Spawners for 2014
# Build the 2014 row and add it to the end of wenatchee_df.
# Values that are not observed yet are real missing values (np.nan) rather
# than the string 'NaN', so the numeric columns do not pick up string cells.
# NOTE(review): 'Wild_Spawners_Prev_Yr' (-298.89) is hard-coded; presumably
# it is the model's 2013 prediction copied from an earlier output. A negative
# spawner count is not physically meaningful, so this forecast step is
# extrapolating from a questionable input -- confirm whether it should be
# clipped at zero or the forecast horizon shortened.
row_2014 = {
    'Brood_Year': 2014,
    'Nwr_Population_Name': 'Chinook Salmon (Upper Columbia River Spring-run ESU) - Wenatchee River',
    'Number_Of_Spawners': np.nan,
    'Fracwild': np.nan,
    'Wild_Spawners': np.nan,
    'Wild_Spawners_Prev_Yr': -298.89,
    'Wild_Spawners_Two_Yrs_Prior': 113.53,
    'Wild_Spawners_Three_Yrs_Prior': 1225.90,
}
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# gives the same result and works on older versions too.
wenatchee_df = pd.concat([wenatchee_df, pd.DataFrame([row_2014])], ignore_index=True)
wenatchee_df.tail()

In [None]:
# Run the fitted model over every row of wenatchee_df (historical rows and
# the appended 2012-2014 rows); the output array has one prediction per row.
prediction_features = [
    'Brood_Year',
    'Wild_Spawners_Prev_Yr',
    'Wild_Spawners_Two_Yrs_Prior',
    'Wild_Spawners_Three_Yrs_Prior',
]
X_predict = wenatchee_df[prediction_features]

predictions_2014 = l_regr.predict(X_predict)

# Build the output frame: identifying/observed columns plus the predictions.
# .assign() returns a new frame, so the prediction column is not written
# onto a slice of wenatchee_df (which raises SettingWithCopyWarning).
output_columns = ['Brood_Year', 'Nwr_Population_Name', 'Number_Of_Spawners', 'Fracwild', 'Wild_Spawners']
predict_df = wenatchee_df[output_columns].assign(Predicted_Wild_Spawners=predictions_2014)

predict_df

In [None]:
# Save the predictions as a CSV file under Data/, leaving out the row index.
predict_csv_path = Path('Data') / 'wenatchee_predict.csv'
predict_df.to_csv(predict_csv_path, index=False)

In [None]:
import time

In [None]:
# Upload the predictions CSV to the 'wenatchee_predict' table in chunks of
# 10,000 rows, reporting progress and elapsed time after each chunk.
rows_imported = 0
start_time = time.perf_counter()  # monotonic clock, suited to elapsed-time measurement
for chunk_number, data in enumerate(pd.read_csv('Data/wenatchee_predict.csv', chunksize=10000)):

    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    # The first chunk replaces the table and later chunks append to it, so
    # re-running the notebook does not pile duplicate rows onto the
    # previous run's upload (if_exists='append' on every chunk did).
    write_mode = 'replace' if chunk_number == 0 else 'append'
    data.to_sql(name='wenatchee_predict', con=engine, if_exists=write_mode)
    rows_imported += len(data)

    print(f'Done. {time.perf_counter() - start_time} total seconds elapsed')