In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# pgAdmin interface

In [3]:
# Load the pre-processed salmon dataset and preview its first rows
data_file = 'Data/salmon_pre_processed.csv'
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr
0,1976,Chinook Salmon (Snake River Fall-run ESU) - Sn...,470.0,1975.0,2008.0,617.0,1.0,1000.0,1851.0,1.0
1,1977,Chinook Salmon (Snake River Fall-run ESU) - Sn...,600.0,1975.0,2008.0,1097.0,1.0,470.0,617.0,1.0
2,1978,Chinook Salmon (Snake River Fall-run ESU) - Sn...,640.0,1975.0,2008.0,1011.0,1.0,600.0,1097.0,1.0
3,1979,Chinook Salmon (Snake River Fall-run ESU) - Sn...,500.0,1975.0,2008.0,514.0,1.0,640.0,1011.0,1.0
4,1980,Chinook Salmon (Snake River Fall-run ESU) - Sn...,450.0,1975.0,2008.0,159.0,1.0,500.0,514.0,1.0


In [4]:
# Isolate the stream we want to teach the model on
target_population = 'Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal'
population_rows = df.loc[df['Nwr_Population_Name'] == target_population]

# Fail with a clear message if the name matched nothing, instead of an
# opaque IndexError when the first row is removed below.
if population_rows.empty:
    raise ValueError("No rows found for population %r" % target_population)

# Discard the stream's first row and renumber the index from 0.
# Built as one non-inplace chain so the filtered slice is never mutated.
# NOTE(review): the reason for dropping the first row is not recorded here —
# presumably its *_Prev_Yr values are not usable; confirm against the
# pre-processing step.
target_df = population_rows.iloc[1:].reset_index(drop=True)
target_df

Unnamed: 0,Brood_Year,Nwr_Population_Name,Number_Of_Spawners,Start_Year,End_Year,Effective_Catch,Fracwild,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr
0,1971,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,1340.0,,,8467.0,0.73,1157.0,7187.0,0.79
1,1972,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,551.0,,,2779.0,0.77,1340.0,8467.0,0.73
2,1973,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,913.0,,,6988.0,0.77,551.0,2779.0,0.77
3,1974,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,347.0,,,1278.0,0.77,913.0,6988.0,0.77
4,1975,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,954.0,,,4596.0,0.78,347.0,1278.0,0.77
5,1976,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,347.0,,,1144.0,0.49,954.0,4596.0,0.78
6,1977,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,735.0,,,5176.0,0.67,347.0,1144.0,0.49
7,1978,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,56.0,,,519.0,0.53,735.0,5176.0,0.67
8,1979,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,676.0,,,11906.0,0.71,56.0,519.0,0.53
9,1980,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal,245.0,,,3627.0,0.72,676.0,11906.0,0.71


In [5]:
# Keep the brood year and stream name aside in their own frame
id_columns = ['Brood_Year', 'Nwr_Population_Name']
pop_name = target_df.filter(items=id_columns, axis='columns')
pop_name.head()

Unnamed: 0,Brood_Year,Nwr_Population_Name
0,1971,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal
1,1972,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal
2,1973,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal
3,1974,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal
4,1975,Chinook Salmon (Puget Sound ESU) - Mid-Hood Canal


In [6]:
# Remove the stream name and the Start_Year / End_Year columns so that only
# the brood year and numeric measurements remain in the modelling frame
non_feature_columns = ['Nwr_Population_Name', 'Start_Year', 'End_Year']
target_df = target_df.drop(columns=non_feature_columns)
target_df

Unnamed: 0,Brood_Year,Number_Of_Spawners,Effective_Catch,Fracwild,Spawners_Prev_Yr,Eff_Catch_Prev_Yr,Fracwild_Prev_Yr
0,1971,1340.0,8467.0,0.73,1157.0,7187.0,0.79
1,1972,551.0,2779.0,0.77,1340.0,8467.0,0.73
2,1973,913.0,6988.0,0.77,551.0,2779.0,0.77
3,1974,347.0,1278.0,0.77,913.0,6988.0,0.77
4,1975,954.0,4596.0,0.78,347.0,1278.0,0.77
5,1976,347.0,1144.0,0.49,954.0,4596.0,0.78
6,1977,735.0,5176.0,0.67,347.0,1144.0,0.49
7,1978,56.0,519.0,0.53,735.0,5176.0,0.67
8,1979,676.0,11906.0,0.71,56.0,519.0,0.53
9,1980,245.0,3627.0,0.72,676.0,11906.0,0.71


In [7]:
# Target: the spawner count the model should predict
target_column = "Number_Of_Spawners"
y = target_df[target_column]

# Features: every remaining column of the modelling frame
X = target_df.drop(columns=target_column)

In [8]:
# Hold out a test set; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the rows are consecutive brood years carrying previous-year
# columns, and train_test_split shuffles by default, so past and future years
# are mixed across train and test — confirm this is intended or consider a
# chronological split.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [9]:
# Build a linear regression model and fit it on the training rows
regr = LinearRegression().fit(X_train, y_train)

# Predict spawner counts for the held-out test rows
y_pred = regr.predict(X_test)

In [10]:
# Score the model on the held-out test rows
# Fitted weights, one per feature column
print("Coefficients: \n", regr.coef_)

# Mean squared error between observed and predicted spawner counts
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % mse)

# Root of the mean squared error (same units as the target)
rmse = np.sqrt(mse)
print('Root Mean Squared error of is:', rmse)

# Coefficient of determination: 1 is perfect prediction
r2 = r2_score(y_test, y_pred)
print("Coefficient of determination: %.2f" % r2)


Coefficients: 
 [ 2.94041448e+00  8.47371439e-02  4.94649150e+01  4.19174785e-01
 -2.44296943e-02  8.90366821e+01]
Mean squared error: 9094.57
Root Mean Squared error of is: 95.36543884723007
Coefficient of determination: 0.85


In [11]:
# year = target_df.Brood_Year.values.reshape(-1, 1)

In [12]:
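# NOTE: left disabled — `year` is itself commented out, and `y_pred` only holds
# predictions for X_test, so it cannot be plotted against the full-length
# `year` array as written.
# plt.scatter(year, y)
# plt.plot(year, y_pred, color='red')
# plt.show()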