In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# pgAdmnin interface

In [3]:
# Pull in Data
df = pd.read_csv('Data/salmon_preprocessed.csv')
df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior
0,1977,Chinook Salmon (Snake River Fall-run ESU) - Sn...,600.0,1975.0,2008.0,1097.0,1.0,470.0,1.0,1.0,1000.0,1.0,1.0
1,1978,Chinook Salmon (Snake River Fall-run ESU) - Sn...,640.0,1975.0,2008.0,1011.0,1.0,600.0,1.0,1.0,470.0,1.0,1.0
2,1979,Chinook Salmon (Snake River Fall-run ESU) - Sn...,500.0,1975.0,2008.0,514.0,1.0,640.0,1.0,1.0,600.0,1.0,1.0
3,1980,Chinook Salmon (Snake River Fall-run ESU) - Sn...,450.0,1975.0,2008.0,159.0,1.0,500.0,1.0,1.0,640.0,1.0,1.0
4,1981,Chinook Salmon (Snake River Fall-run ESU) - Sn...,340.0,1975.0,2008.0,150.0,1.0,450.0,1.0,1.0,500.0,1.0,1.0


In [4]:
# Isolate the Stream we want to teach the model on and reset index
target_df = df.loc[df['Nwr_Population_Name'] == 'Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal']
target_df.reset_index(drop=True, inplace=True)
target_df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior
0,1970,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,1157.0,,,7187.0,0.79,6664.0,0.96,0.96,5290.0,0.93,0.93
1,1971,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,1340.0,,,8467.0,0.73,1157.0,0.79,0.79,6664.0,0.96,0.96
2,1972,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,551.0,,,2779.0,0.77,1340.0,0.73,0.73,1157.0,0.79,0.79
3,1973,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,913.0,,,6988.0,0.77,551.0,0.77,0.77,1340.0,0.73,0.73
4,1974,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,347.0,,,1278.0,0.77,913.0,0.77,0.77,551.0,0.77,0.77


In [5]:
# Delete first two rows and reset index
target_df = target_df.drop(target_df.index[[0,1]])
target_df.reset_index(drop=True, inplace=True)
target_df

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr,Spawners_Two_Yrs_Prior,Eff_Catch_Two_Yrs_Prior,Fracwild_Two_Yrs_Prior
0,1972,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,551.0,,,2779.0,0.77,1340.0,0.73,0.73,1157.0,0.79,0.79
1,1973,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,913.0,,,6988.0,0.77,551.0,0.77,0.77,1340.0,0.73,0.73
2,1974,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,347.0,,,1278.0,0.77,913.0,0.77,0.77,551.0,0.77,0.77
3,1975,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,954.0,,,4596.0,0.78,347.0,0.77,0.77,913.0,0.77,0.77
4,1976,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,347.0,,,1144.0,0.49,954.0,0.78,0.78,347.0,0.77,0.77
5,1977,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,735.0,,,5176.0,0.67,347.0,0.49,0.49,954.0,0.78,0.78
6,1978,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,56.0,,,519.0,0.53,735.0,0.67,0.67,347.0,0.49,0.49
7,1979,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,676.0,,,11906.0,0.71,56.0,0.53,0.53,735.0,0.67,0.67
8,1980,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,245.0,,,3627.0,0.72,676.0,0.71,0.71,56.0,0.53,0.53
9,1981,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,175.0,,,2524.0,0.89,245.0,0.72,0.72,676.0,0.71,0.71


In [6]:
# Scale idependent variables
scaler = StandardScaler()
scaled_target = target_df [[
    'Spawners_Prev_Yr', 
    'Eff_Catch_Prev_Yr', 
    'Fracwild_Prev_Yr', 
    'Spawners_Two_Yrs_Prior',
    'Eff_Catch_Two_Yrs_Prior',
    'Fracwild_Two_Yrs_Prior']].values.tolist()

scaled_data =  scaler.fit_transform(scaled_target)
scaled_data

array([[ 2.67932564, -0.10651484, -0.10651484,  2.00353985,  0.24836902,
         0.24836902],
       [ 0.52703803,  0.16588377,  0.16588377,  2.47751194, -0.24836902,
        -0.24836902],
       [ 1.51452613,  0.16588377,  0.16588377,  0.43399294,  0.08278967,
         0.08278967],
       [-0.02944698,  0.16588377,  0.16588377,  1.37157707,  0.08278967,
         0.08278967],
       [ 1.62636871,  0.23398342,  0.23398342, -0.09436939,  0.08278967,
         0.08278967],
       [-0.02944698, -1.74090651, -1.74090651,  1.47776754,  0.16557935,
         0.16557935],
       [ 1.02896568, -0.51511276, -0.51511276, -0.09436939, -2.23532119,
        -2.23532119],
       [-0.82325648, -1.4685079 , -1.4685079 ,  0.91055504, -0.74510706,
        -0.74510706],
       [ 0.86802149, -0.24271415, -0.24271415, -0.84806271, -1.90416249,
        -1.90416249],
       [-0.30768949, -0.17461449, -0.17461449,  0.75774436, -0.41394837,
        -0.41394837],
       [-0.49864023,  0.98307961,  0.98307961, -0.

In [7]:
# Create DatsFrame with scaled data
scaled_df = pd.DataFrame(
    data=scaled_data,
    columns=['Spawners_Prev_Yr_Sc', 
        'Eff_Catch_Prev_Yr_Sc', 
        'Fracwild_Prev_Yr_Sc', 
        'Spawners_Two_Yrs_Prior_Sc',
        'Eff_Catch_Two_Yrs_Prior_Sc',
        'Fracwild_Two_Yrs_Prior_Sc'],
    index=target_df.index)

In [8]:
# Delete columns of unscaled independent variables
target_df.drop(columns = [
    'Spawners_Prev_Yr', 
    'Eff_Catch_Prev_Yr', 
    'Fracwild_Prev_Yr', 
    'Spawners_Two_Yrs_Prior',
    'Eff_Catch_Two_Yrs_Prior',
    'Fracwild_Two_Yrs_Prior'],
    inplace=True)

# Combine target_df and scaled_df
scaled_target_df = pd.merge(target_df, scaled_df, left_index=True, right_index=True)
scaled_target_df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Spawners_Prev_Yr_Sc,Eff_Catch_Prev_Yr_Sc,Fracwild_Prev_Yr_Sc,Spawners_Two_Yrs_Prior_Sc,Eff_Catch_Two_Yrs_Prior_Sc,Fracwild_Two_Yrs_Prior_Sc
0,1972,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,551.0,,,2779.0,0.77,2.679326,-0.106515,-0.106515,2.00354,0.248369,0.248369
1,1973,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,913.0,,,6988.0,0.77,0.527038,0.165884,0.165884,2.477512,-0.248369,-0.248369
2,1974,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,347.0,,,1278.0,0.77,1.514526,0.165884,0.165884,0.433993,0.08279,0.08279
3,1975,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,954.0,,,4596.0,0.78,-0.029447,0.165884,0.165884,1.371577,0.08279,0.08279
4,1976,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,347.0,,,1144.0,0.49,1.626369,0.233983,0.233983,-0.094369,0.08279,0.08279


In [9]:
# Create our Feature
X = target_df.drop(columns = ['Nwr_Population_Name', 'Start_Year', 'End_Year', 'Effective_Catch', 'Fracwild', 'Number_Of_Spawners'])

# Create our Target
y = target_df["Number_Of_Spawners"]

In [10]:
# Split data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
#Counter(y_train)

In [11]:
# Set up a regression ML model
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(X_train, y_train)

# Make predictions using the testing set
y_pred = regr.predict(X_test)

In [12]:
# Score the model
print("Coefficients: \n", regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Square root of the mean squared
print('Root Mean Squared error of is:',np.sqrt(mean_squared_error(y_test,y_pred)))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))


Coefficients: 
 [-12.52925653]
Mean squared error: 41759.19
Root Mean Squared error of is: 204.3506438785857
Coefficient of determination: 0.27


In [13]:
# year = target_df.Brood_Year.values.reshape(-1, 1)

In [14]:
# plt.scatter(year, y)
# plt.plot(year, y_pred, color='red')
# plt.show()