In this tutorial we will train and test a linear regressor. This is something you have already done in part 3 of the course document. So, I will miss out all of the extra notes and just get to business.

In [37]:
# needed to create the data frame
import pandas as pd

# create data frame from csv file we hosted on our github
df = pd.read_csv('https://raw.githubusercontent.com/uhi22018990/DataAnalytics/main/collision_data_clean.csv', index_col=0, )

In [38]:
# make sure we have our data by printing it out
print(df)

      num_collisions  dim_date_day_number  temperature  dew_point  \
1           0.152974                    7         83.6       63.0   
2           0.314585                    1         80.3       54.1   
3           0.936168                    2         79.8       56.7   
4          -0.505904                    3         81.8       65.6   
6           0.774556                    5         81.9       62.3   
...              ...                  ...          ...        ...   
3765       -1.419631                    5         53.8       34.3   
3766       -1.463141                    6         57.1       40.0   
3767       -1.935544                    7         57.5       46.7   
3768       -1.848523                    1         57.2       52.8   
3769       -1.985271                    2         64.0       62.2   

      sea_level_pressure  visibility  wind_speed  max_sustained_wind_speed  \
1                 1008.9         9.7         4.1                       9.9   
2              

## predictors

In [39]:
# needed to help with speedy maths based calculations
import numpy as np

# iloc allows us to select by rows. Here, we are shuffling the data by rows determined at random.
shuffle = df.iloc[np.random.permutation(len(df))]

# we are selecting all rows of the columns outlined excluding number of collisions
predictors = shuffle.iloc[:,1:11]


# print out the first 6 rows of predictors.
print(predictors[:6])

      dim_date_day_number  temperature  dew_point  sea_level_pressure  \
1394                    7         56.2       23.4              1016.4   
936                     4         34.4       19.0              1022.5   
3190                    4         55.6       51.6              1017.2   
1060                    2         76.1       57.5              1021.8   
2775                    2         50.1       39.1              1009.3   
2108                    7         38.6       13.9              1012.2   

      visibility  wind_speed  max_sustained_wind_speed  max_temperature  \
1394        10.0         6.2                       9.9             71.1   
936          8.3         3.4                       8.0             39.9   
3190         4.1         4.1                       8.0             68.0   
1060        10.0         4.5                       7.0             88.0   
2775         9.8         2.5                       7.0             60.1   
2108        10.0         7.4          

In [40]:
# print out the shuffled data (first 5 rows)
shuffle[:5]

Unnamed: 0,num_collisions,dim_date_day_number,temperature,dew_point,sea_level_pressure,visibility,wind_speed,max_sustained_wind_speed,max_temperature,min_temperature,total_precipitation
1394,-0.406451,7,56.2,23.4,1016.4,10.0,6.2,9.9,71.1,46.9,0.01
936,-0.033501,4,34.4,19.0,1022.5,8.3,3.4,8.0,39.9,25.0,0.0
3190,-1.593674,4,55.6,51.6,1017.2,4.1,4.1,8.0,68.0,46.9,1.27
1060,1.072916,2,76.1,57.5,1021.8,10.0,4.5,7.0,88.0,64.0,0.0
2775,-0.306998,2,50.1,39.1,1009.3,9.8,2.5,7.0,60.1,37.0,0.0


## targets

In [41]:
# Select all rows for the num_collisions column
targets = shuffle.iloc[:,0]

# print out the first 6 rows of the targets data.
print(targets)

1394   -0.406451
936    -0.033501
3190   -1.593674
1060    1.072916
2775   -0.306998
          ...   
2052   -0.213760
1931    0.898873
1214    0.731045
2206    0.314585
379    -0.263487
Name: num_collisions, Length: 3521, dtype: float64


In [42]:
# A scale is not required here, but the constant will be useful in the assignment.
SCALE_NUM_COLLISIONS = 1.0

In [43]:
# Split our data into a training set i.e. 80% of the length of the shuffle array
trainsize = int(len(shuffle['num_collisions'])*0.8)
# The test set size is 100% - 80% = 20% of the length of the shuffle array.
testsize = len(shuffle['num_collisions']) - trainsize

# Define the number of input values (predictors)
nppredictors = 10
# Define the number of output values (targets)
noutputs = 1

In [44]:
# import tensorflow
# %tensorflow_version 1.x
!pip install tensorflow==1.15.2
import tensorflow as tf

# check the version
print(tf.__version__)

# needed for high-level file management
import shutil  

# logging for tensorflow
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

# removes a saved model from the last training attempt.
shutil.rmtree('/tmp/linear_regression_trained_model', ignore_errors=True)

# This is the core of our linear regressor

# You can see that we save the model, use the the Adam optimization algorithm, which is an extension 
# to stochastic gradient descent that has recently seen broader adoption for deep learning applications 
# in computer vision and natural language processing and infer real valued columns from input which interprets 
# all inputs as dense, fixed-length float values.

# See the link for more information
# https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/contrib/learn/LinearRegressor
estimator = tf.contrib.learn.SKCompat(
    tf.contrib.learn.LinearRegressor(
        model_dir='/tmp/linear_regression_trained_model', 
        optimizer=tf.train.AdamOptimizer(learning_rate=0.1), 
        enable_centered_bias=False, 
        feature_columns=tf.contrib.learn.infer_real_valued_columns_from_input(predictors.values)))

# Prints a log to show model is starting to train
print("starting to train");

# Train the model. Pass in predictor values and target values.
estimator.fit(
    predictors[:trainsize].values, 
    targets[:trainsize].values.reshape(trainsize, noutputs)/SCALE_NUM_COLLISIONS, steps=10000)

# Next, we can check our predictions based on our predictors.
preds = estimator.predict(x=predictors[trainsize:].values)

# Apply the Scale value (not really needed here) to the outputs.
predslistscale = preds['scores']*SCALE_NUM_COLLISIONS

# pred = format(str(predslistscale)) # useful for checking outputs and printing.

# Calculate RMSE i.e. how good the model works using the predictions and targets.
# i.e. take the difference between the actual and the forecast then square the difference, 
# find the average of all the squares and then find the square root. 
# The RMSE essentially punishes larger errors i.e. it puts a heavier weight on larger errors.
rmse = np.sqrt(np.mean((targets[trainsize:].values - predslistscale)**2))
print('LinearRegression has RMSE of {0}'.format(rmse));


# Calculate the mean of the Number of collisions Values.
avg = np.mean(shuffle['num_collisions'][:trainsize])

# Calculate the RMSE using Number of collisions Values and the mean of all target values.
# The fit of a proposed regression model should therefore be better than the fit of the mean model.
# In this case, it doesn't seem to be the case but it will vary on every run.
rmse = np.sqrt(np.mean((shuffle['num_collisions'][trainsize:] - avg)**2))
print('Just using average = {0} has RMSE of {1}'.format(avg, rmse));

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc25d086510>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/linear_regression_trained_model', '_session_creation_timeout_secs': 7200}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.


1.15.2
starting to train


INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/linear_regression_trained_model/model.ckpt.
INFO:tensorflow:loss = 1.182139, step = 1
INFO:tensorflow:global_step/sec: 1055.97
INFO:tensorflow:loss = 1.5067521, step = 101 (0.099 sec)
INFO:tensorflow:global_step/sec: 1168.88
INFO:tensorflow:loss = 1.1495833, step = 201 (0.086 sec)
INFO:tensorflow:global_step/sec: 1281.29
INFO:tensorflow:loss = 1.174067, step = 301 (0.075 sec)
INFO:tensorflow:global_step/sec: 1167.87
INFO:tensorflow:loss = 0.8451722, step = 401 (0.086 sec)
INFO:tensorflow:global_step/sec: 1177.6
INFO:tensorflow:loss = 1.0791321, step = 501 (0.087 sec)
INFO:tensorflow:global_step/sec: 1231.84
INFO:tensorflow:loss = 1.2086818, step = 601 (0.081 sec)
INFO:tensorflow:global_step/sec: 1180.09
INFO:tensorflow:loss = 1.3846271, step = 701 (0.086 sec)
INFO:tensorflow:global_step/sec: 1213.45
INFO:tensorflow:loss = 0.8897308, step = 801 (0.081 sec

LinearRegression has RMSE of 2.427208474235313
Just using average = -0.017264123717224234 has RMSE of 0.9978062966816492


In [68]:

input = pd.DataFrame.from_dict(data = {
    'dim_date_day_number' : [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
    'temperature' : [47.3, 55.4, 36.3, 79.2, 77.5, 74.0, 70.9, 76.5, 80.0, 70.4],
    'dew_point'	 : [42.9, 44.4, 20.3, 57.7, 58.3, 62.4, 62.9, 64.5, 69.8, 49.5],
    'sea_level_pressure' 	 : [996.6, 1014.6, 1019.6, 1009.5, 1012.9, 1007.9, 1001.1, 1015.8, 1006.1, 1019.6],
    'visibility' : [7.1, 9.8, 10.0, 9.4, 10, 7.4, 7.9, 10, 9.8, 9],
    'wind_speed' : [9.4, 7.0, 4.2, 4.3, 4.2, 1.9, 4.6, 2.3, 2.5, 2],
    'max_sustained_wind_speed' : [32.1, 11.1, 7.0, 8.9, 8.0, 6.0, 8.9, 5.1, 6.0, 0.1],
    'max_temperature' : [55.9, 63.0, 43.0, 87.1, 82.9, 82.9, 78.1, 88.0, 86.0, 80.1],
    'min_temperature' : [39.9, 41.0, 25.0, 64.0, 71.1, 64.9, 64.9, 70.0, 66.0, 57.0],
    'total_precipitation' : [0.56, 0.01, 0.0, 0.0, 0.03, 0.68, 0.63, 0.0, 0.0, 0.0],
       })
					
# 2012-12-21 	5   751 	47.3 	42.9 	996.6 	7.1 	9.4 	32.1 	55.9 	39.0 	0.56
# 2013-12-21 	6   594 	55.4 	44.4 	1014.6 	9.8 	7.0 	11.1 	63.0 	41.0 	0.01	
# 2021-12-21 	2   256 	36.3 	20.3 	1019.6 	10.0 	4.2 	7 	  43.0 	25.0 	0.0
# 2016-06-21 	2   681 	79.2 	57.7 	1009.5 	9.4 	4.3 	8.9   87.1 	64.0  0.0
# 2017-06-21 	3   744 	77.5 	58.3 	1012.9 	10.0 	4.2 	8.0   82.9 	71.1  0.03
# 2018-06-21 	4   815 	74.0 	62.4 	1007.9 	7.4 	1.9 	6.0   82.9 	64.9  0.68
# 2019-06-21 	5   771 	70.9 	62.9 	1001.1 	7.9 	4.6 	8.9   78.1 	64.9  0.63
# 2020-06-21 	7   245 	76.5 	64.5 	1015.8 	10.0 	2.3 	5.1   88.0 	70.0  0.0
# 2021-06-21 	1   361 	80.0 	69.8 	1006.1 	9.8 	2.5 	6     86.0 	66.0  0.0
# 2022-06-21 	2   317 	70.4 	49.5 	1019.6 	9.0 	2 	  .1    80.1 	57.0  0.0

estimator = tf.contrib.learn.SKCompat(
    tf.contrib.learn.LinearRegressor(
        model_dir='/tmp/linear_regression_trained_model', 
        enable_centered_bias=False, 
        feature_columns=tf.contrib.learn.infer_real_valued_columns_from_input(input.values)))

preds = estimator.predict(x=input.values)
# calulate back the value based on sd 160.879646383154 and mean 513.4
predslistnorm = preds['scores']
predslistscale = (preds['scores']*160.879646383154) + 513.4
print(predslistnorm)
print(predslistscale)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc25d0555d0>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/linear_regression_trained_model', '_session_creation_timeout_secs': 7200}
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/linear_regression_trained_model/model.ck

[-2.2950442 -2.3010023 -2.0943105 -2.2109106 -2.2619824 -2.206986
 -2.2905529 -2.378499  -2.3824246 -2.098521 ]
[144.1741  143.21558 176.46808 157.7095  149.49307 158.34088 144.89667
 130.74792 130.1164  175.79068]


let's try another day

In [69]:

input = pd.DataFrame.from_dict(data = {
    'dim_date_day_number' : [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    'temperature' : [47.3, 55.4, 36.3, 79.2, 77.5, 74.0, 70.9, 76.5, 80.0, 70.4],
    'dew_point'	 : [42.9, 44.4, 20.3, 57.7, 58.3, 62.4, 62.9, 64.5, 69.8, 49.5],
    'sea_level_pressure' 	 : [996.6, 1014.6, 1019.6, 1009.5, 1012.9, 1007.9, 1001.1, 1015.8, 1006.1, 1019.6],
    'visibility' : [7.1, 9.8, 10.0, 9.4, 10, 7.4, 7.9, 10, 9.8, 9],
    'wind_speed' : [9.4, 7.0, 4.2, 4.3, 4.2, 1.9, 4.6, 2.3, 2.5, 2],
    'max_sustained_wind_speed' : [32.1, 11.1, 7.0, 8.9, 8.0, 6.0, 8.9, 5.1, 6.0, 0.1],
    'max_temperature' : [55.9, 63.0, 43.0, 87.1, 82.9, 82.9, 78.1, 88.0, 86.0, 80.1],
    'min_temperature' : [39.9, 41.0, 25.0, 64.0, 71.1, 64.9, 64.9, 70.0, 66.0, 57.0],
    'total_precipitation' : [0.56, 0.01, 0.0, 0.0, 0.03, 0.68, 0.63, 0.0, 0.0, 0.0],
       })
					
# 2012-12-21 	5   751 	47.3 	42.9 	996.6 	7.1 	9.4 	42.9 	55.9 	39.0 	0.56
# 2013-12-21 	6   594 	55.4 	44.4 	1014.6 	9.8 	7.0 	11.1 	63.0 	41.0 	0.01	
# 2021-12-21 	2   256 	36.3 	20.3 	1019.6 	10.0 	4.2 	7 	  43.0 	25.0 	0.0
# 2016-06-21 	2   681 	79.2 	57.7 	1009.5 	9.4 	4.3 	8.9   87.1 	64.0  0.0
# 2017-06-21 	3   744 	77.5 	58.3 	1012.9 	10.0 	4.2 	8.0   82.9 	71.1  0.03
# 2018-06-21 	4   815 	74.0 	62.4 	1007.9 	7.4 	1.9 	6.0   82.9 	64.9  0.68
# 2019-06-21 	5   771 	70.9 	62.9 	1001.1 	7.9 	4.6 	8.9   78.1 	64.9  0.63
# 2020-06-21 	7   245 	76.5 	64.5 	1015.8 	10.0 	2.3 	5.1   88.0 	70.0  0.0
# 2021-06-21 	1   361 	80.0 	69.8 	1006.1 	9.8 	2.5 	6     86.0 	66.0  0.0
# 2022-06-21 	2   317 	70.4 	49.5 	1019.6 	9.0 	2 	  .1    80.1 	57.0  0.0

estimator = tf.contrib.learn.SKCompat(
    tf.contrib.learn.LinearRegressor(
        model_dir='/tmp/linear_regression_trained_model', 
        enable_centered_bias=False, 
        feature_columns=tf.contrib.learn.infer_real_valued_columns_from_input(input.values)))

preds = estimator.predict(x=input.values)
# calulate back the value based on sd 160.879646383154 and mean 513.4
predslistnorm = preds['scores']
predslistscale = (preds['scores']*160.879646383154) + 513.4
print(predslistnorm)
print(predslistscale)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc25cfc6d10>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_log_step_count_steps': 100, '_protocol': None, '_session_config': None, '_save_checkpoints_steps': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/tmp/linear_regression_trained_model', '_session_creation_timeout_secs': 7200}
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/linear_regression_trained_model/model.ck

[-2.0702355 -2.0761936 -1.8695018 -1.986102  -2.0371737 -1.9821773
 -2.065744  -2.15369   -2.1576157 -1.8737124]
[180.34125 179.38272 212.63522 193.87662 185.66022 194.50803 181.06384
 166.9151  166.28357 211.95782]
