In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score


In [2]:
# pgAdmin interface

In [3]:
# Load the preprocessed salmon dataset and preview its first rows
csv_path = 'Data/salmon_preprocessed.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Wild_Spawners_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior,Wild_Spawners_Two_Yrs_Prior
0,1977,Chinook Salmon (Snake River Fall-run ESU) - Sn...,600.0,1975.0,2008.0,1097.0,1.0,600.0,470.0,617.0,1.0,470.0,1000.0,1.0,1851.0,1000.0
1,1978,Chinook Salmon (Snake River Fall-run ESU) - Sn...,640.0,1975.0,2008.0,1011.0,1.0,640.0,600.0,1097.0,1.0,600.0,470.0,1.0,617.0,470.0
2,1979,Chinook Salmon (Snake River Fall-run ESU) - Sn...,500.0,1975.0,2008.0,514.0,1.0,500.0,640.0,1011.0,1.0,640.0,600.0,1.0,1097.0,600.0
3,1980,Chinook Salmon (Snake River Fall-run ESU) - Sn...,450.0,1975.0,2008.0,159.0,1.0,450.0,500.0,514.0,1.0,500.0,640.0,1.0,1011.0,640.0
4,1981,Chinook Salmon (Snake River Fall-run ESU) - Sn...,340.0,1975.0,2008.0,150.0,1.0,340.0,450.0,159.0,1.0,450.0,500.0,1.0,514.0,500.0


In [4]:
# Keep only the rows of the stream the model is trained on,
# renumbering them from 0
target_population = 'Chinook Salmon (Puget Sound ESU) - Upper Sauk River'
is_target = df['Nwr_Population_Name'] == target_population
target_df = df.loc[is_target].reset_index(drop=True)
target_df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Wild_Spawners_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior,Wild_Spawners_Two_Yrs_Prior
0,1954,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,649.0,,,3999.0,1.0,649.0,269.0,77.0,0.96,258.24,273.0,0.99,208.0,270.27
1,1955,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,844.0,,,4828.0,1.0,844.0,649.0,3999.0,1.0,649.0,269.0,0.96,77.0,258.24
2,1956,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,1884.0,,,12920.0,1.0,1884.0,844.0,4828.0,1.0,844.0,649.0,1.0,3999.0,649.0
3,1957,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,2523.0,,,14630.0,1.0,2523.0,1884.0,12920.0,1.0,1884.0,844.0,1.0,4828.0,844.0
4,1958,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,636.0,,,2968.0,1.0,636.0,2523.0,14630.0,1.0,2523.0,1884.0,1.0,12920.0,1884.0


In [5]:
# Remove the two leading rows, then renumber the remaining rows from 0.
# NOTE(review): target_df is rebuilt from itself, so each re-run of this
# cell removes two more rows — re-run the filtering cell first.
leading_labels = target_df.index[[0, 1]]
target_df = target_df.drop(index=leading_labels).reset_index(drop=True)
target_df

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Wild_Spawners_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior,Wild_Spawners_Two_Yrs_Prior
0,1956,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,1884.0,,,12920.0,1.0,1884.0,844.0,4828.0,1.0,844.0,649.0,1.0,3999.0,649.0
1,1957,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,2523.0,,,14630.0,1.0,2523.0,1884.0,12920.0,1.0,1884.0,844.0,1.0,4828.0,844.0
2,1958,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,636.0,,,2968.0,1.0,636.0,2523.0,14630.0,1.0,2523.0,1884.0,1.0,12920.0,1884.0
3,1959,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,740.0,,,5119.0,1.0,740.0,636.0,2968.0,1.0,636.0,2523.0,1.0,14630.0,2523.0
4,1960,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,3345.0,,,21627.0,1.0,3345.0,740.0,5119.0,1.0,740.0,636.0,1.0,2968.0,636.0
5,1961,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,3302.0,,,17777.0,1.0,3302.0,3345.0,21627.0,1.0,3345.0,740.0,1.0,5119.0,740.0
6,1962,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,1643.0,,,11366.0,1.0,1643.0,3302.0,17777.0,1.0,3302.0,3345.0,1.0,21627.0,3345.0
7,1963,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,1249.0,,,5952.0,1.0,1249.0,1643.0,11366.0,1.0,1643.0,3302.0,1.0,17777.0,3302.0
8,1964,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,681.0,,,4504.0,1.0,681.0,1249.0,5952.0,1.0,1249.0,1643.0,1.0,11366.0,1643.0
9,1965,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,2018.0,,,13108.0,1.0,2018.0,681.0,4504.0,1.0,681.0,1249.0,1.0,5952.0,1249.0


In [6]:
# Scale independent variables with a min-max scaler
# Columns holding the previous-year and two-years-prior values
lag_feature_columns = [
    'Spawners_Prev_Yr',
    'Eff_Catch_Prev_Yr',
    'Fracwild_Prev_Yr',
    'Wild_Spawners_Prev_Yr',
    'Spawners_Two_Yrs_Prior',
    'Eff_Catch_Two_Yrs_Prior',
    'Fracwild_Two_Yrs_Prior',
    'Wild_Spawners_Two_Yrs_Prior',
]

scaler = MinMaxScaler()

# Plain nested list of the unscaled values, one inner list per row
scaled_target = target_df[lag_feature_columns].values.tolist()

# Fit the scaler on every row and transform in the same step
scaled_data = scaler.fit_transform(scaled_target)
scaled_data

array([[2.27371023e-01, 2.21439496e-01, 1.00000000e+00, 2.28400775e-01,
        1.67130059e-01, 1.00000000e+00, 1.83018955e-01, 1.68240100e-01],
       [5.48656163e-01, 5.96468462e-01, 1.00000000e+00, 5.49257710e-01,
        2.27371023e-01, 1.00000000e+00, 2.21439496e-01, 2.28400775e-01],
       [7.46061168e-01, 6.75719516e-01, 1.00000000e+00, 7.46399615e-01,
        5.48656163e-01, 1.00000000e+00, 5.96468462e-01, 5.49257710e-01],
       [1.63113994e-01, 1.35236595e-01, 1.00000000e+00, 1.64229388e-01,
        7.46061168e-01, 1.00000000e+00, 6.75719516e-01, 7.46399615e-01],
       [1.95242508e-01, 2.34926079e-01, 1.00000000e+00, 1.96315082e-01,
        1.63113994e-01, 1.00000000e+00, 1.35236595e-01, 1.64229388e-01],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.95242508e-01, 1.00000000e+00, 2.34926079e-01, 1.96315082e-01],
       [9.86716095e-01, 8.21569264e-01, 1.00000000e+00, 9.86733800e-01,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.

In [7]:
# Create DataFrame with scaled data, sharing target_df's row index
scaled_column_names = [
    'Spawners_Prev_Yr_Sc',
    'Eff_Catch_Prev_Yr_Sc',
    'Fracwild_Prev_Yr_Sc',
    'Wild_Spawners_Prev_Yr_Sc',
    'Spawners_Two_Yrs_Prior_Sc',
    'Eff_Catch_Two_Yrs_Prior_Sc',
    'Fracwild_Two_Yrs_Prior_Sc',
    'Wild_Spawners_Two_Yrs_Prior_Sc',
]
scaled_df = pd.DataFrame(
    scaled_data,
    index=target_df.index,
    columns=scaled_column_names,
)

In [8]:
# Unscaled independent variables; their scaled counterparts are in scaled_df
unscaled_columns = [
    'Spawners_Prev_Yr',
    'Eff_Catch_Prev_Yr',
    'Fracwild_Prev_Yr',
    'Wild_Spawners_Prev_Yr',
    'Spawners_Two_Yrs_Prior',
    'Eff_Catch_Two_Yrs_Prior',
    'Fracwild_Two_Yrs_Prior',
    'Wild_Spawners_Two_Yrs_Prior',
]

# Drop the unscaled columns on a new frame instead of mutating target_df.
# An in-place drop made this cell (and the scaling cell, which reads these
# columns from target_df) fail with a KeyError when run a second time.
unscaled_removed_df = target_df.drop(columns=unscaled_columns)

# Combine the remaining columns with scaled_df, matching rows on the index
scaled_target_df = pd.merge(
    unscaled_removed_df,
    scaled_df,
    left_index=True,
    right_index=True,
)
scaled_target_df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr_Sc,Eff_Catch_Prev_Yr_Sc,Fracwild_Prev_Yr_Sc,Wild_Spawners_Prev_Yr_Sc,Spawners_Two_Yrs_Prior_Sc,Eff_Catch_Two_Yrs_Prior_Sc,Fracwild_Two_Yrs_Prior_Sc,Wild_Spawners_Two_Yrs_Prior_Sc
0,1956,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,1884.0,,,12920.0,1.0,1884.0,0.227371,0.221439,1.0,0.228401,0.16713,1.0,0.183019,0.16824
1,1957,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,2523.0,,,14630.0,1.0,2523.0,0.548656,0.596468,1.0,0.549258,0.227371,1.0,0.221439,0.228401
2,1958,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,636.0,,,2968.0,1.0,636.0,0.746061,0.67572,1.0,0.7464,0.548656,1.0,0.596468,0.549258
3,1959,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,740.0,,,5119.0,1.0,740.0,0.163114,0.135237,1.0,0.164229,0.746061,1.0,0.67572,0.7464
4,1960,Chinook Salmon (Puget Sound ESU) - Upper Sauk ...,3345.0,,,21627.0,1.0,3345.0,0.195243,0.234926,1.0,0.196315,0.163114,1.0,0.135237,0.164229


In [9]:
# Columns left out of the feature matrix
# NOTE(review): Brood_Year is not in this list, so it remains in X
# alongside the scaled columns — confirm this is intended.
non_feature_columns = [
    'Nwr_Population_Name',
    'Start_Year',
    'End_Year',
    'Effective_Catch',
    'Fracwild',
    'Number_Of_Spawners',
    'Wild_Spawners',
]

# Create our Features: every column that is not excluded above
X = scaled_target_df.drop(columns=non_feature_columns)

# Create our Target
y = scaled_target_df['Wild_Spawners']


In [10]:
# Split features and target into train and test subsets;
# random_state is fixed so the split is repeatable
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [11]:
# Regression ML model: create the linear regression estimator
regr = LinearRegression()

# Fit it on the training split
regr.fit(X_train, y_train)

# Predict the target for the rows of the testing split
y_pred = regr.predict(X_test)

In [13]:
# Score the model on the testing split
# Each metric is computed once and reused in the report below
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the mean squared error
r2 = r2_score(y_test, y_pred)  # coefficient of determination: 1 is perfect prediction

print("Coefficients: \n", regr.coef_)
print("Mean squared error: %.2f" % mse)
print('Root Mean Squared error of is:', rmse)
print("Coefficient of determination: %.2f" % r2)


Coefficients: 
 [-9.52829901e+00 -3.07919857e+04  1.73190996e+03  6.95253068e+02
  3.12185366e+04  4.39034391e+04  9.27255302e+02 -3.26468933e+03
 -4.32640743e+04]
Mean squared error: 267611.05
Root Mean Squared error of is: 517.3113700618229
Coefficient of determination: -1.97
