Start of google colab specific stuff

In [69]:
# from google.colab import files  

In [70]:
# files.upload() # (to upload kaggle.json)

In [71]:
# !mkdir ~/.kaggle # (makes a hidden folder in the home directory of the Linux instance allotted in Colab)

In [72]:
# !mv {path of kaggle.json} ~/.kaggle # (moving kaggle.json to that hidden folder in linux)

In [73]:
# !chmod 600 ~/.kaggle/kaggle.json # (updating the permission to read, write only and not executable)

In [74]:
# !pip install kaggle

In [75]:
# !kaggle datasets download -d sohier/calcofi # (to download dataset)

In [76]:
# !unzip {path of downloaded zip}

End of google colab specific stuff <hr>

Getting the data

In [77]:
import pandas as pd
import numpy as np

In [78]:
# Load the CalCOFI bottle dataset (downloaded from kaggle as sohier/calcofi).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the mixed-dtype warning that
# read_csv otherwise emits for columns with inconsistent value types.
data = pd.read_csv("bottle.csv", low_memory=False)

  data = pd.read_csv("bottle.csv")


In [79]:
# Preview the first rows to sanity-check that the file loaded as expected.
data.head()

Unnamed: 0,Cst_Cnt,Btl_Cnt,Sta_ID,Depth_ID,Depthm,T_degC,Salnty,O2ml_L,STheta,O2Sat,...,R_PHAEO,R_PRES,R_SAMP,DIC1,DIC2,TA1,TA2,pH2,pH1,DIC Quality Comment
0,1,1,054.0 056.0,19-4903CR-HY-060-0930-05400560-0000A-3,0,10.5,33.44,,25.649,,...,,0,,,,,,,,
1,1,2,054.0 056.0,19-4903CR-HY-060-0930-05400560-0008A-3,8,10.46,33.44,,25.656,,...,,8,,,,,,,,
2,1,3,054.0 056.0,19-4903CR-HY-060-0930-05400560-0010A-7,10,10.46,33.437,,25.654,,...,,10,,,,,,,,
3,1,4,054.0 056.0,19-4903CR-HY-060-0930-05400560-0019A-3,19,10.45,33.42,,25.643,,...,,19,,,,,,,,
4,1,5,054.0 056.0,19-4903CR-HY-060-0930-05400560-0020A-7,20,10.45,33.421,,25.643,,...,,20,,,,,,,,


Problem statement: Is there a relationship between water salinity & water temperature? Can you predict the water temperature based on salinity?

<br>Observations:
- output (temperature) has range (-inf,+inf)
- We keep the features that are highly correlated (Pearson correlation) with salinity and drop the remaining features.
- $y$ (target; its prediction is $\hat{y}$): T_degC
- The first 4 columns (counts and IDs) are not useful as features either.
- There are also a lot of missing values. Some columns (features) have so many missing values that they are useless; drop them.

<hr> Start of Preprocessing task

In [80]:
# Positional slice: keep every column from position 4 onward.
reduced_data = data.iloc[:,4:] # dropped first 4 columns (counts and IDs)

In [81]:
# Count the missing values in each remaining column.
reduced_data.isna().sum()

Depthm                      0
T_degC                  10963
Salnty                  47354
O2ml_L                 168662
STheta                  52689
                        ...  
TA1                    862779
TA2                    864629
pH2                    864853
pH1                    864779
DIC Quality Comment    864808
Length: 70, dtype: int64

In [82]:
# Keep only the columns that have at least 80% non-NA values; in other words,
# drop every column in which more than 20% of the values are missing.
# `thresh` is documented as an integer count of required non-NA values, so the
# fractional row count is rounded up. Because non-NA counts are integers, this
# keeps exactly the same columns as comparing against the raw float would.
min_non_na = int(np.ceil(0.8 * reduced_data.shape[0]))
reduced_data_copy = reduced_data.dropna(thresh=min_non_na, axis=1)

In [83]:
# (rows, columns) remaining after the sparse columns were dropped.
reduced_data_copy.shape

(864863, 22)

Now, we will drop those rows in which output feature (T_degC) values are missing <br>

Using a boolean mask, we can identify every row in which the value of column ```T_degC``` is missing. The mask is a boolean Series with one entry per row; 10963 of its entries are `True` (one for each missing ```T_degC``` value).


In [84]:
# Discard every row whose target value (T_degC) is missing; all other rows
# are kept unchanged.
reduced_data_copy = reduced_data_copy.dropna(subset=['T_degC'])

In [85]:
# Re-count missing values per column now that rows without T_degC are gone.
reduced_data_copy.isna().sum()

Depthm             0
T_degC             0
Salnty         39653
O2ml_L        162613
STheta         41726
RecInd             0
T_prec             0
S_prec         39653
NH3q           56564
C14A1q         14641
C14A2q         14643
DarkAq         21754
MeanAq         21755
R_Depth            0
R_TEMP             0
R_POTEMP       35084
R_SALINITY     39653
R_SIGMA        41893
R_SVA          41808
R_DYNHT        39264
R_O2          162613
R_PRES             0
dtype: int64

imputation of missing values using sklearn

In [86]:
from sklearn.impute import SimpleImputer

In [87]:
# Replace every missing value with the mean of its column.
# Note: the column means are computed over all rows of reduced_data_copy.
imputer = SimpleImputer(strategy='mean')

# fit_transform returns a plain NumPy array, so the column labels are lost ...
imputed_values = imputer.fit_transform(reduced_data_copy)

# ... and are restored here by wrapping the array in a new DataFrame.
df = pd.DataFrame(imputed_values, columns=reduced_data_copy.columns)

In [88]:
# Confirm that no missing values are left after imputation.
df.isna().sum()

Depthm        0
T_degC        0
Salnty        0
O2ml_L        0
STheta        0
RecInd        0
T_prec        0
S_prec        0
NH3q          0
C14A1q        0
C14A2q        0
DarkAq        0
MeanAq        0
R_Depth       0
R_TEMP        0
R_POTEMP      0
R_SALINITY    0
R_SIGMA       0
R_SVA         0
R_DYNHT       0
R_O2          0
R_PRES        0
dtype: int64

In [89]:
# (rows, columns) of the fully imputed frame.
df.shape

(853900, 22)

In [90]:
# Target vector: the water temperature column as a standalone NumPy array.
Y_train = df["T_degC"].to_numpy(copy=True)

In [91]:
# Feature matrix: every column except the target, as a NumPy array.
# NOTE(review): R_TEMP and R_POTEMP stay among the features; judging by their
# names they may be temperature readings that leak the target - confirm.
feature_frame = df.drop(columns=['T_degC'])
X_train = np.array(feature_frame)

In [92]:
# (samples, features) of the feature matrix.
X_train.shape

(853900, 21)

End of preprocessing task <hr>

Creating Shallow network with two hidden layers


In [93]:
import tensorflow as tf
from keras.layers import Input # for input layer
from keras.layers import Dense # for hidden layer + output layer
from keras.models import Sequential # SISO , no skipping of any layer for data flow

In [94]:
def create_dnn(n_features=21, hidden_units=21): # densely connected network
  """Build an (uncompiled) fully connected regression network.

  Args:
    n_features: number of input features per sample. Defaults to 21.
    hidden_units: number of neurons in each of the two hidden layers.
      Defaults to 21.

  Returns:
    A Sequential model: Input -> Dense(relu) -> Dense(relu) -> Dense(1).
  """
  dnn = Sequential() # layers are applied one after another, no layer is skipped
  dnn.add(Input(shape=(n_features,))) # input layer
  dnn.add(Dense(units=hidden_units,activation="relu")) # first hidden layer
  dnn.add(Dense(units=hidden_units,activation="relu")) # second hidden layer
  dnn.add(Dense(units=1)) # output layer; no activation given, so it is linear
  return dnn

In [95]:
# Instantiate the network with the default layer sizes.
dnn = create_dnn()

In [96]:
dnn # our model (shows the object's repr)

<keras.src.engine.sequential.Sequential at 0x1687e43a0>

In [97]:
# Print each layer's output shape and parameter count.
dnn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 21)                462       
                                                                 
 dense_7 (Dense)             (None, 21)                462       
                                                                 
 dense_8 (Dense)             (None, 1)                 22        
                                                                 
Total params: 946 (3.70 KB)
Trainable params: 946 (3.70 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [98]:
# List the model's layer objects.
dnn.layers

[<keras.src.layers.core.dense.Dense at 0x2ad8e0d90>,
 <keras.src.layers.core.dense.Dense at 0x1687e4130>,
 <keras.src.layers.core.dense.Dense at 0x2cfa055b0>]

In [99]:
# dnn.layers[0].trainable = False # attribute modified, made the parameters of first layer non-trainable

In [100]:
# Set up the training environment for the model: the optimizer (training
# algorithm), the loss function and the error metric to report.
# When no optimizer is passed, compile() falls back to rmsprop.
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
dnn.compile(
    loss="mse",
    optimizer="sgd",
    metrics=[rmse_metric],
)

There are 3 ways of passing data to our NN model to fit it:
1. Using custom training data generator
2. Using Keras training data generator
3. Directly passing the data. (X_train, Y_train) <br>

Directly passing the data loads all of it into RAM at once, so with a large dataset the RAM can fill up completely. That is why a generator is used: it hands the data to the NN in batches to process.

In [101]:
# Method 3 of fitting: pass the complete in-memory arrays straight to fit().
dnn.fit(
    x=X_train,
    y=Y_train,
    epochs=25,
    batch_size=200,
    validation_split=0.2,
)

Epoch 1/25
   1/3416 [..............................] - ETA: 16:02 - loss: 2257.5735 - root_mean_squared_error: 47.5139

2023-06-03 21:26:31.512586: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-03 21:26:48.420412: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/25
Epoch 3/25
Epoch 4/25

KeyboardInterrupt: 

In [None]:
# Number of samples per batch produced by the custom training generator.
mini_batch_size = 80

In [None]:
# NOTE: despite its name, this is the number of rows left for *training*
# (total rows minus the 20% reserved for validation), not the size of the
# validation set. The multiplication by 0.20 makes the result a float.
validation_size = df.shape[0]-df.shape[0]*0.20
validation_size

683120.0

In [None]:
def training_data_generator(features=None, targets=None, batch_size=None, validation_fraction=0.20):
    """Endlessly yield (features, targets) mini-batches from the training rows.

    The trailing `validation_fraction` of the rows is kept for validation and
    is never yielded. Only full batches are produced; leftover training rows
    that do not fill a whole batch are skipped. After the last batch the
    generator starts again from the first row.

    Args:
        features: 2-D array of inputs. Defaults to the notebook-level X_train.
        targets: 1-D array of outputs aligned with `features`. Defaults to the
            notebook-level Y_train.
        batch_size: rows per batch. Defaults to the notebook-level
            mini_batch_size.
        validation_fraction: share of rows (taken from the end) kept for
            validation. Defaults to 0.20.

    Yields:
        Tuples (features[start:stop, :], targets[start:stop]).

    Raises:
        ValueError: if the training part is too small for even one full batch
            (looping would otherwise spin forever without yielding).
    """
    # Fall back to the notebook globals so the zero-argument call keeps working.
    if features is None:
        features = X_train
    if targets is None:
        targets = Y_train
    if batch_size is None:
        batch_size = mini_batch_size

    # Same arithmetic as the rest of the notebook uses for the train/validation
    # boundary; the row count comes from the feature array itself.
    n_rows = features.shape[0]
    n_batches = int(n_rows - n_rows * validation_fraction) // batch_size
    if n_batches < 1:
        raise ValueError("not enough training rows for a single full batch")

    while True: # to repeat the yield
        for i in range(n_batches):
            start = i * batch_size
            stop = start + batch_size
            yield features[start:stop, :], targets[start:stop]

In [None]:
# Create the generator object; no batch is produced until it is advanced.
datagen = training_data_generator()

In [None]:
datagen  # (output of lazy function: a generator object, not the data itself)

<generator object training_data_generator at 0x2b7bbe040>

In [None]:
# Pull one batch from the generator and inspect the shape of its target part.
next(datagen)[1].shape

(80,)

## .fit parameters guide:
Value of **batch size** should divide the size of training data. <br> 
When the number of training examples (𝑁) is not a multiple of the batch size (𝐵), Keras creates an additional batch for the remaining data. In other words, you end up having  (𝑁 𝑑𝑖𝑣 𝐵)+1 batches, and the last batch has  (𝑁 𝑚𝑜𝑑 𝐵) training examples. <br> <br>
**verbosity** (`verbose`): controls how much progress output is shown during training, which indicates that our model is being trained <br> <br>
**callbacks**: functions that supervise the training and take action when something goes wrong (such as overfitting). Callbacks can be custom or built into Keras <br> <br>
**validation_data**: pass a tuple of features and outputs directly, or a generator of such tuples <br> <br>
**validation_split**: used when you pass the whole dataset for training. This value specifies what fraction of the data is held out for validation <br> <br>
**shuffle**: shuffles the order of the training data points before each epoch <br> <br>

In [None]:
# Rows from the split index onward (the last 20%) form the validation set;
# the rows before it are the ones served by the training generator.
split_index = int(df.shape[0] - df.shape[0] * 0.20)
validation_data = (X_train[split_index:], Y_train[split_index:])

In [None]:
# Method 1 of fitting: feed batches from the custom training data generator.
# The generator never stops, so steps_per_epoch tells fit() how many batches
# make up one epoch (the number of full batches in the training rows).
steps_per_epoch = int(df.shape[0] - df.shape[0] * 0.20) // mini_batch_size
dnn.fit(
    training_data_generator(),
    steps_per_epoch=steps_per_epoch,
    epochs=15,
    validation_data=validation_data,
)

2023-05-26 23:10:32.480065: I tensorflow/core/common_runtime/executor.cc:1210] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


ValueError: Tensor conversion requested dtype int32 for Tensor with dtype float32: <tf.Tensor: shape=(), dtype=float32, numpy=0.0>