In [None]:
#imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [None]:
# Load the bottle dataset from the working directory.
data = pd.read_csv('bottle.csv', delimiter=',', low_memory=False)

# Keep only rows where every modelling column is present.
# The default how='any' already drops a row if any listed column is missing;
# recent pandas raises a TypeError when `how` and `thresh` are both passed,
# so neither is given here. Reassigning (instead of inplace=True) keeps the
# cell idempotent on re-run.
data = data.dropna(subset=['T_degC', 'Salnty', 'STheta'])

# Columns are selected by name rather than by position so that no whole-frame
# object array is materialised and the arrays come back as floats.
# NOTE(review): this assumes positional columns 5 and 8 of bottle.csv are
# 'T_degC' and 'STheta' respectively -- confirm against the CSV header.

# Density (STheta) on its own, kept as a 2-D (n, 1) array.
dens = data[['STheta']].to_numpy()

# Features: salinity coupled with density (STheta).
x = data[['Salnty', 'STheta']]

# Target: water temperature, kept 2-D (n, 1) so downstream shapes are unchanged.
y = data[['T_degC']].to_numpy()

# Split into training and test sets (70/30). A fixed random_state makes the
# split, and therefore every metric below, reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=.30, random_state=42
)



In [None]:
# Peek at the first two rows of the held-out feature matrix as a raw array.
print(xtest.head(2).to_numpy())

In [None]:
# Create an ordinary least-squares linear regressor ...
regression_model = LinearRegression()
# ... and train it on the training split (features first, then target).
regression_model.fit(X=xtrain, y=ytrain)

In [None]:
# Predict on the held-out features, then show the same slice (rows 1-4)
# of the predictions followed by the true targets for a quick comparison.
ypred = regression_model.predict(xtest)
for values in (ypred, ytest):
    print(values[1:5])

In [None]:
# Evaluate the fit on the held-out split.
# mean_squared_error returns the MSE; the original stored it in a variable
# called `rmse`, which was misleading. `mse` now holds the MSE and `rmse`
# holds its square root (the error in the target's own units).
mse = mean_squared_error(ytest, ypred)
rmse = np.sqrt(mse)
# R^2 expressed as a percentage.
r2 = r2_score(ytest, ypred) * 100

# Print the fitted parameters and the metrics before plotting.
print('Slope: ', regression_model.coef_)
print('Intercept: ', regression_model.intercept_)
print('Mean square error of the data: ', mse)
print('Root mean square error of the data: ', rmse)
print('R2 percentage of the data: ', r2)


In [None]:
# Show the relationship between salinity and temperature when density is
# NOT accounted for.
#
# `regression_model` was fitted on two features (Salnty, STheta), so it cannot
# predict from salinity alone, and `xtrain` has two columns, so it cannot be
# scattered directly against the single-column `ytrain`. A separate
# salinity-only model is therefore fitted here just for this figure.
salinity_train = xtrain[['Salnty']]
# Flatten the (n, 1) target to 1-D floats for fitting and plotting.
temperature_train = np.asarray(ytrain, dtype=float).ravel()

salinity_only_model = LinearRegression().fit(salinity_train, temperature_train)

# Salinity values at which to draw the fitted line (28 .. 37, as before).
# A named column is used so the input matches the fitted feature name.
salinity_grid = pd.DataFrame({'Salnty': np.arange(28, 38)})

fig, ax = plt.subplots()
ax.scatter(salinity_train['Salnty'].to_numpy(), temperature_train, s=.01)
ax.plot(
    salinity_grid['Salnty'].to_numpy(),
    salinity_only_model.predict(salinity_grid),
    color='y',
)
ax.set_xlabel('Salinity')
ax.set_ylabel('Temp. Deg Celsius')
plt.show()

In [None]:
# Contour plot of predicted water temperature over salinity and density,
# using the two-feature model.
#
# These results resemble several results seen in papers reporting studies on
# salinity, water temperature and density.
N = 100

# Grid axes. Distinct names are used so the notebook-level `x` / `y`
# (model features / target) are not overwritten by this cell.
# NOTE(review): the original read positional column 6 via data.values; this
# assumes that column is 'Salnty' -- confirm against the CSV header.
salinity_axis = np.linspace(data['Salnty'].min(), data['Salnty'].max(), N)
# Density is hard-coded to the 24-30 range (view the dataset for the full
# range of density values); a narrower range gives a more concise picture of
# the model trend.
density_axis = np.linspace(24, 30, N)

X, Y = np.meshgrid(salinity_axis, density_axis)

# Use the model to get a z value for every (salinity, density) grid point.
# Columns are named and ordered exactly as in training so the prediction
# input matches the fitted feature names.
grid_features = pd.DataFrame({'Salnty': X.ravel(), 'STheta': Y.ravel()})
z = np.reshape(regression_model.predict(grid_features), X.shape)

# Filled contours with automatically selected levels (drawn once; the
# original called contourf twice with identical arguments).
fig, ax = plt.subplots()
cs = ax.contourf(X, Y, z)

cbar = fig.colorbar(cs)
cbar.set_label('Predicted temp. (deg C)')
ax.set_xlabel('Salinity')
ax.set_ylabel('Density (STheta)')
plt.show()

In [None]:
# Check how many predicted values in the contour grid are greater than zero,
# compared with the total number of grid points.
#p = z.ravel()
#len(p[p > 0]),len(p)