In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# pgAdmin interface

In [3]:
# Load the preprocessed salmon data set; the path is relative to the
# notebook's working directory.
df = pd.read_csv(Path('Data/salmon_preprocessed.csv'))

# Preview the first rows
df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Wild_Spawners_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior,Wild_Spawners_Two_Yrs_Prior
0,1977,Chinook Salmon (Snake River Fall-run ESU) - Sn...,600.0,1975.0,2008.0,1097.0,1.0,600.0,470.0,617.0,1.0,470.0,1000.0,1.0,1851.0,1000.0
1,1978,Chinook Salmon (Snake River Fall-run ESU) - Sn...,640.0,1975.0,2008.0,1011.0,1.0,640.0,600.0,1097.0,1.0,600.0,470.0,1.0,617.0,470.0
2,1979,Chinook Salmon (Snake River Fall-run ESU) - Sn...,500.0,1975.0,2008.0,514.0,1.0,500.0,640.0,1011.0,1.0,640.0,600.0,1.0,1097.0,600.0
3,1980,Chinook Salmon (Snake River Fall-run ESU) - Sn...,450.0,1975.0,2008.0,159.0,1.0,450.0,500.0,514.0,1.0,500.0,640.0,1.0,1011.0,640.0
4,1981,Chinook Salmon (Snake River Fall-run ESU) - Sn...,340.0,1975.0,2008.0,150.0,1.0,340.0,450.0,159.0,1.0,450.0,500.0,1.0,514.0,500.0


In [4]:
# Keep only the rows of the population the model is trained on and
# renumber the index from 0 (the old index labels are discarded).
target_df = (
    df.loc[df['Nwr_Population_Name'] == 'Chinook Salmon (Puget Sound ESU) - White River']
    .reset_index(drop=True)
)
target_df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Wild_Spawners_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior,Wild_Spawners_Two_Yrs_Prior
0,1967,Chinook Salmon (Puget Sound ESU) - White River,688.0,,,478.0,1.0,688.0,1828.0,178.0,0.23,420.44,1184.0,0.23,68.0,272.32
1,1968,Chinook Salmon (Puget Sound ESU) - White River,513.0,,,373.0,1.0,513.0,688.0,478.0,1.0,688.0,1828.0,0.23,178.0,420.44
2,1969,Chinook Salmon (Puget Sound ESU) - White River,548.0,,,325.0,1.0,548.0,513.0,373.0,1.0,513.0,688.0,1.0,478.0,688.0
3,1970,Chinook Salmon (Puget Sound ESU) - White River,659.0,,,510.0,1.0,659.0,548.0,325.0,1.0,548.0,513.0,1.0,373.0,513.0
4,1971,Chinook Salmon (Puget Sound ESU) - White River,394.0,,,238.0,1.0,394.0,659.0,510.0,1.0,659.0,548.0,1.0,325.0,548.0


In [5]:
# Remove the rows at positions 0 and 1, then renumber the index from 0.
# NOTE(review): target_df is rebuilt from itself, so running this cell a
# second time removes two more rows. Re-run the filtering cell first.
target_df = (
    target_df
    .drop(index=target_df.index[[0, 1]])
    .reset_index(drop=True)
)
target_df

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Wild_Spawners_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior,Wild_Spawners_Two_Yrs_Prior
0,1969,Chinook Salmon (Puget Sound ESU) - White River,548.0,,,325.0,1.0,548.0,513.0,373.0,1.0,513.0,688.0,1.0,478.0,688.0
1,1970,Chinook Salmon (Puget Sound ESU) - White River,659.0,,,510.0,1.0,659.0,548.0,325.0,1.0,548.0,513.0,1.0,373.0,513.0
2,1971,Chinook Salmon (Puget Sound ESU) - White River,394.0,,,238.0,1.0,394.0,659.0,510.0,1.0,659.0,548.0,1.0,325.0,548.0
3,1972,Chinook Salmon (Puget Sound ESU) - White River,442.0,,,350.0,1.0,442.0,394.0,238.0,1.0,394.0,659.0,1.0,510.0,659.0
4,1973,Chinook Salmon (Puget Sound ESU) - White River,169.0,,,95.0,1.0,169.0,442.0,350.0,1.0,442.0,394.0,1.0,238.0,394.0
5,1974,Chinook Salmon (Puget Sound ESU) - White River,444.0,,,269.0,1.0,444.0,169.0,95.0,1.0,169.0,442.0,1.0,350.0,442.0
6,1975,Chinook Salmon (Puget Sound ESU) - White River,490.0,,,338.0,1.0,490.0,444.0,269.0,1.0,444.0,169.0,1.0,95.0,169.0
7,1976,Chinook Salmon (Puget Sound ESU) - White River,251.0,,,285.0,1.0,251.0,490.0,338.0,1.0,490.0,444.0,1.0,269.0,444.0
8,1977,Chinook Salmon (Puget Sound ESU) - White River,77.0,,,79.0,1.0,77.0,251.0,285.0,1.0,251.0,490.0,1.0,338.0,490.0
9,1978,Chinook Salmon (Puget Sound ESU) - White River,141.0,,,217.0,1.0,141.0,77.0,79.0,1.0,77.0,251.0,1.0,285.0,251.0


In [6]:
# Scale independent variables
# Lagged predictors (previous year and two years prior) to standardize.
lag_feature_columns = [
    'Spawners_Prev_Yr',
    'Eff_Catch_Prev_Yr',
    'Fracwild_Prev_Yr',
    'Wild_Spawners_Prev_Yr',
    'Spawners_Two_Yrs_Prior',
    'Eff_Catch_Two_Yrs_Prior',
    'Fracwild_Two_Yrs_Prior',
    'Wild_Spawners_Two_Yrs_Prior',
]

# Despite its name, scaled_target holds the raw (unscaled) values as a
# list of rows; it is the input handed to the scaler.
scaled_target = target_df[lag_feature_columns].to_numpy().tolist()

# NOTE(review): the scaler is fit on every row here, i.e. before the
# train/test split performed later in the notebook.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(scaled_target)
scaled_data

array([[-2.17245864e-01,  7.81035550e-02,  7.50994947e-01,
        -8.15474867e-02, -1.03758703e-02,  7.13404739e-01,
         2.65614593e-01,  2.01564736e-01],
       [-1.77471120e-01, -1.40303051e-02,  7.50994947e-01,
        -2.51681819e-02, -2.09495052e-01,  7.13404739e-01,
         6.40035163e-02, -8.04056890e-02],
       [-5.13283601e-02,  3.41068947e-01,  7.50994947e-01,
         1.53634756e-01, -1.69671215e-01,  7.13404739e-01,
        -2.81615472e-02, -2.40116039e-02],
       [-3.52479993e-01, -1.81022927e-01,  7.50994947e-01,
        -2.73237123e-01, -4.33727632e-02,  7.13404739e-01,
         3.27057968e-01,  1.54838208e-01],
       [-2.97931773e-01,  3.39560804e-02,  7.50994947e-01,
        -1.95916934e-01, -3.44896095e-01,  7.13404739e-01,
        -1.95210725e-01, -2.72145578e-01],
       [-6.08174776e-01, -4.55505051e-01,  7.50994947e-01,
        -6.35675511e-01, -2.90280548e-01,  7.13404739e-01,
         1.98410901e-02, -1.94805119e-01],
       [-2.95658930e-01, -1.215198

In [7]:
# Create DataFrame with scaled data
# The frame reuses target_df's index so both frames can later be aligned
# row by row; every column name carries an "_Sc" suffix.
scaled_column_names = [
    'Spawners_Prev_Yr_Sc',
    'Eff_Catch_Prev_Yr_Sc',
    'Fracwild_Prev_Yr_Sc',
    'Wild_Spawners_Prev_Yr_Sc',
    'Spawners_Two_Yrs_Prior_Sc',
    'Eff_Catch_Two_Yrs_Prior_Sc',
    'Fracwild_Two_Yrs_Prior_Sc',
    'Wild_Spawners_Two_Yrs_Prior_Sc',
]
scaled_df = pd.DataFrame(
    scaled_data,
    index=target_df.index,
    columns=scaled_column_names,
)

In [8]:
# Replace the unscaled independent variables with their scaled versions.
# The drop returns a new frame instead of mutating target_df in place, so
# this cell, and the scaling cell that reads these columns from target_df,
# can be re-run without raising a KeyError for already-removed columns.
unscaled_columns = [
    'Spawners_Prev_Yr',
    'Eff_Catch_Prev_Yr',
    'Fracwild_Prev_Yr',
    'Wild_Spawners_Prev_Yr',
    'Spawners_Two_Yrs_Prior',
    'Eff_Catch_Two_Yrs_Prior',
    'Fracwild_Two_Yrs_Prior',
    'Wild_Spawners_Two_Yrs_Prior',
]

# Combine target_df (minus the unscaled columns) and scaled_df on their
# shared index.
scaled_target_df = pd.merge(
    target_df.drop(columns=unscaled_columns),
    scaled_df,
    left_index=True,
    right_index=True,
)
scaled_target_df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Wild_Spawners,Spawners_Prev_Yr_Sc,Eff_Catch_Prev_Yr_Sc,Fracwild_Prev_Yr_Sc,Wild_Spawners_Prev_Yr_Sc,Spawners_Two_Yrs_Prior_Sc,Eff_Catch_Two_Yrs_Prior_Sc,Fracwild_Two_Yrs_Prior_Sc,Wild_Spawners_Two_Yrs_Prior_Sc
0,1969,Chinook Salmon (Puget Sound ESU) - White River,548.0,,,325.0,1.0,548.0,-0.217246,0.078104,0.750995,-0.081547,-0.010376,0.713405,0.265615,0.201565
1,1970,Chinook Salmon (Puget Sound ESU) - White River,659.0,,,510.0,1.0,659.0,-0.177471,-0.01403,0.750995,-0.025168,-0.209495,0.713405,0.064004,-0.080406
2,1971,Chinook Salmon (Puget Sound ESU) - White River,394.0,,,238.0,1.0,394.0,-0.051328,0.341069,0.750995,0.153635,-0.169671,0.713405,-0.028162,-0.024012
3,1972,Chinook Salmon (Puget Sound ESU) - White River,442.0,,,350.0,1.0,442.0,-0.35248,-0.181023,0.750995,-0.273237,-0.043373,0.713405,0.327058,0.154838
4,1973,Chinook Salmon (Puget Sound ESU) - White River,169.0,,,95.0,1.0,169.0,-0.297932,0.033956,0.750995,-0.195917,-0.344896,0.713405,-0.195211,-0.272146


In [11]:
# Create our Target
y = scaled_target_df['Wild_Spawners']

# Create our Feature
# Columns removed from the feature matrix (the target itself is among them).
# NOTE(review): Brood_Year is not listed, so it remains in X as an unscaled
# feature alongside the "_Sc" columns. Confirm this is intended.
non_feature_columns = [
    'Nwr_Population_Name',
    'Start_Year',
    'End_Year',
    'Effective_Catch',
    'Fracwild',
    'Number_Of_Spawners',
    'Wild_Spawners',
]
X = scaled_target_df.drop(columns=non_feature_columns)


In [12]:
# Split data into test and train
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): rows are shuffled before splitting although they are ordered
# by Brood_Year and carry lagged values. Confirm a random split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,  # scikit-learn's default, spelled out
    random_state=1,
)

In [13]:
# Set up a regression ML model
# Build the linear regression estimator and train it on the training split
# in one step (fit returns the fitted estimator itself).
regr = linear_model.LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out test rows
y_pred = regr.predict(X_test)

In [14]:
# Score the model
# One fitted coefficient per column of X
print("Coefficients: \n", regr.coef_)

# Mean squared error on the test split, computed once and reused below
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % mse)

# Root mean squared error (same units as the target)
# NOTE(review): "of is" in the message looks like a typo; the text is left
# unchanged so it keeps matching the recorded output.
print('Root Mean Squared error of is:', np.sqrt(mse))

# Coefficient of determination: 1 is a perfect prediction; a negative value
# means the model does worse than always predicting the mean of y_test
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))


Coefficients: 
 [   2.12539657 1048.35505995 -349.3177448    43.1499541   142.75371879
 -721.85947769   -9.35497244  158.66817241  394.7828315 ]
Mean squared error: 1132760.01
Root Mean Squared error of is: 1064.3119908658377
Coefficient of determination: -2.65


In [None]:
# year = target_df.Brood_Year.values.reshape(-1, 1)

In [None]:
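# NOTE(review): y_pred only covers the X_test rows while year spans every row
# of target_df, so the plt.plot call below would fail on mismatched lengths
# if re-enabled as is.
# plt.scatter(year, y)
# plt.plot(year, y_pred, color='red')
# plt.show()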