# Project 1: Creating an ML model to predict the daily high temperature in Champaign, IL

---

### This project will utilize "KCMI.csv", a dataset of daily weather statistics at Champaign's Willard Airport from August 1, 2005 to August 1, 2025. The meaning of each column header can be found here: https://www.ncei.noaa.gov/data/daily-summaries/doc/GHCND_documentation.pdf

In [67]:
# Here, I import the necessary libraries for data analysis and cleaning,
# load the full dataset, and display its first five rows to get a feel for what I am looking at.

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Render every column when a DataFrame is displayed instead of eliding the
# middle ones; this dataset is wide enough that the default would hide some.
pd.options.display.max_columns = None

# Load the complete daily-summary file, then preview only the first five rows.
station = pd.read_csv("KCMI.csv")
station.head(n=5)

Unnamed: 0,STATION,NAME,DATE,AWND,FMTM,PGTM,PRCP,PSUN,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT04,WT05,WT06,WT08,WT09,WT10
0,USW00094870,"CHAMPAIGN URBANA WILLARD AIRPORT, IL US",2005-08-01,3.8,1627.0,,0.0,,,,,89.0,62.0,110.0,110.0,8.9,12.1,,,,,,,,,
1,USW00094870,"CHAMPAIGN URBANA WILLARD AIRPORT, IL US",2005-08-02,4.92,1617.0,,0.0,,,,,90.0,65.0,140.0,160.0,8.9,12.1,,,,,,,,,
2,USW00094870,"CHAMPAIGN URBANA WILLARD AIRPORT, IL US",2005-08-03,5.82,1236.0,,0.0,,,,,91.0,66.0,200.0,210.0,14.1,17.0,,,,,,,,,
3,USW00094870,"CHAMPAIGN URBANA WILLARD AIRPORT, IL US",2005-08-04,6.71,1029.0,,0.0,,,,,86.0,71.0,220.0,210.0,14.1,17.0,,,,,,,,,
4,USW00094870,"CHAMPAIGN URBANA WILLARD AIRPORT, IL US",2005-08-05,5.59,201.0,,0.03,,,,,82.0,67.0,360.0,10.0,13.0,17.0,,,,,,,,,


In [68]:
# Now that I have a feel for what the dataset looks like, I want to
# determine which columns are necessary to ultimately predict Champaign's
# daily high temperature. Displaying the column labels gives me the full
# list of candidates to evaluate from there.

station.columns

Index(['STATION', 'NAME', 'DATE', 'AWND', 'FMTM', 'PGTM', 'PRCP', 'PSUN',
       'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN', 'WDF2', 'WDF5', 'WSF2', 'WSF5',
       'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08', 'WT09', 'WT10'],
      dtype='object')

In [69]:
# STATION and NAME are left out: this is a single-station dataset, so both
# stay constant on every row and carry no information. Every other column is
# kept for now, on the condition that it holds numeric data -- the dtype
# listing at the end of this cell is how I check that.

feature_columns = [
    'DATE', 'AWND', 'FMTM', 'PGTM', 'PRCP', 'PSUN',
    'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN',
    'WDF2', 'WDF5', 'WSF2', 'WSF5',
    'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08', 'WT09', 'WT10',
]
station = station.loc[:, feature_columns]

station.dtypes

DATE     object
AWND    float64
FMTM    float64
PGTM    float64
PRCP    float64
PSUN    float64
SNOW    float64
SNWD    float64
TAVG    float64
TMAX    float64
TMIN    float64
WDF2    float64
WDF5    float64
WSF2    float64
WSF5    float64
WT01    float64
WT02    float64
WT03    float64
WT04    float64
WT05    float64
WT06    float64
WT08    float64
WT09    float64
WT10    float64
dtype: object

In [70]:
# The dtype listing shows every measurement column is float64. DATE is the
# one exception: it is still an object column (the date text as read from the
# CSV, not a parsed datetime). With the measurements confirmed numeric, I can
# move on to analysis. I'll re-print the dataset that I have so far.

station

Unnamed: 0,DATE,AWND,FMTM,PGTM,PRCP,PSUN,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WT01,WT02,WT03,WT04,WT05,WT06,WT08,WT09,WT10
0,2005-08-01,3.80,1627.0,,0.00,,,,,89.0,62.0,110.0,110.0,8.9,12.1,,,,,,,,,
1,2005-08-02,4.92,1617.0,,0.00,,,,,90.0,65.0,140.0,160.0,8.9,12.1,,,,,,,,,
2,2005-08-03,5.82,1236.0,,0.00,,,,,91.0,66.0,200.0,210.0,14.1,17.0,,,,,,,,,
3,2005-08-04,6.71,1029.0,,0.00,,,,,86.0,71.0,220.0,210.0,14.1,17.0,,,,,,,,,
4,2005-08-05,5.59,201.0,,0.03,,,,,82.0,67.0,360.0,10.0,13.0,17.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7299,2025-07-28,5.59,,,0.28,,,,,90.0,74.0,330.0,340.0,21.0,30.0,1.0,,1.0,,,,,,
7300,2025-07-29,5.37,,,0.00,,,,,92.0,73.0,120.0,340.0,16.1,27.1,1.0,,,,,,1.0,,
7301,2025-07-30,4.03,,,0.13,,,,,87.0,70.0,280.0,300.0,18.1,32.0,1.0,1.0,1.0,,,,1.0,,
7302,2025-07-31,10.74,,,0.02,,,,,77.0,65.0,10.0,30.0,18.1,29.1,1.0,,,,,,1.0,,


In [71]:
# Next comes cleaning. Counting the NaN entries in each column shows which
# columns are too sparse to be of any use to the model.

missing_per_column = station.isna().sum()
missing_per_column

DATE       0
AWND      19
FMTM    7151
PGTM    5215
PRCP      39
PSUN    7303
SNOW    7167
SNWD    7120
TAVG    7304
TMAX      10
TMIN      18
WDF2      14
WDF5     106
WSF2      13
WSF5      89
WT01    3397
WT02    6713
WT03    6375
WT04    7298
WT05    7292
WT06    7240
WT08    5908
WT09    7284
WT10    7303
dtype: int64

In [72]:
# TAVG, PSUN, and WT10 are missing on (almost) every day: the counts above
# show 7304, 7303, and 7303 NaN entries respectively, so at most a stray
# value or two exists in any of them. They cannot inform the model and are
# removed. Those are easy fixes, so I'll update the dataset and continue to
# evaluate the other columns.
#
# The selection is materialised with .copy() so that `station` is an
# independent DataFrame. Later cells assign into it, and writing into the
# result of a column selection can raise pandas' SettingWithCopyWarning.

station = station[['DATE', 'AWND', 'FMTM', 'PGTM', 'PRCP',
       'SNOW', 'SNWD', 'TMAX', 'TMIN', 'WDF2', 'WDF5', 'WSF2', 'WSF5',
       'WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08', 'WT09']].copy()

# Re-count the missing values per column for the trimmed dataset.
station.isna().sum()

DATE       0
AWND      19
FMTM    7151
PGTM    5215
PRCP      39
SNOW    7167
SNWD    7120
TMAX      10
TMIN      18
WDF2      14
WDF5     106
WSF2      13
WSF5      89
WT01    3397
WT02    6713
WT03    6375
WT04    7298
WT05    7292
WT06    7240
WT08    5908
WT09    7284
dtype: int64

In [73]:
# The next easiest task is to clean up the WT** columns. These columns represent
# types of weather at the station. These columns are binary, meaning that if the
# weather condition was present, the column is given a 1; otherwise, it is left as NaN.
# The NaNs can be filled with zeroes to indicate that the condition was not observed.

cols = ['WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06', 'WT08', 'WT09']

# A column -> fill-value mapping restricts the fill to the WT** columns, so
# genuine gaps in every other column stay NaN. The filled frame is rebound to
# `station` instead of being assigned into it in place: `station` came from a
# column selection, and writing into such a frame can raise pandas'
# SettingWithCopyWarning.
station = station.fillna(dict.fromkeys(cols, 0))

In [74]:
# Re-count the NaN entries per column to check the result of the WT** fill.
remaining_missing = station.isna().sum()
remaining_missing

DATE       0
AWND      19
FMTM    7151
PGTM    5215
PRCP      39
SNOW    7167
SNWD    7120
TMAX      10
TMIN      18
WDF2      14
WDF5     106
WSF2      13
WSF5      89
WT01       0
WT02       0
WT03       0
WT04       0
WT05       0
WT06       0
WT08       0
WT09       0
dtype: int64