## Final Project - Group 1 - AAI 530
### Zach Artman, Iman Hamdan, Diego Aceveda

In [1]:
import kagglehub
import pandas as pd

## Data Exploration

In [2]:
# Fetch the indoor air quality dataset through kagglehub
path = kagglehub.dataset_download("hemanthkarnati/indoor-air-quality-dataset")
# The returned path is used as the directory that holds the CSV file
dataset_path = "{}/indoor_data.csv".format(path)



In [3]:
# Load the downloaded CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,created_at,entry_id,field1,field2,field3,field4,field5,field6,field7,latitude,longitude,elevation,status
0,2024-04-06 12:00:27+05:30,1256,163,42.0,33.8,49.0,520.0,18.0,21.0,,,,
1,2024-04-06 12:01:30+05:30,1257,162,37.0,33.8,49.0,637.0,36.0,9.0,,,,
2,2024-04-06 12:02:32+05:30,1258,173,47.0,33.7,50.0,679.0,42.0,3.0,,,,
3,2024-04-06 12:03:34+05:30,1259,168,37.0,33.3,51.0,539.0,21.0,0.0,,,,
4,2024-04-06 12:04:36+05:30,1260,168,37.0,33.3,52.0,697.0,45.0,0.0,,,,


In [4]:
# Preview the last five rows to see where the recording ends
df.tail(n=5)

Unnamed: 0,created_at,entry_id,field1,field2,field3,field4,field5,field6,field7,latitude,longitude,elevation,status
1328,2024-04-07 11:54:43+05:30,2584,195,46.0,32.8,51.0,493.0,14.0,0.0,,,,
1329,2024-04-07 11:55:45+05:30,2585,198,40.0,32.8,50.0,539.0,21.0,0.0,,,,
1330,2024-04-07 11:56:47+05:30,2586,191,39.0,32.8,50.0,464.0,9.0,432.0,,,,
1331,2024-04-07 11:58:44+05:30,2587,191,43.0,32.8,49.0,454.0,8.0,0.0,,,,
1332,2024-04-07 11:59:46+05:30,2588,191,39.0,32.8,49.0,493.0,14.0,0.0,,,,


The dataset contains 1,333 readings, taken roughly once per minute, from April 6, 2024 at 12:00 to April 7, 2024 at 11:59 (timestamps are in UTC+05:30), i.e. just under 24 hours of data.

## Preprocessing

In [5]:
# Drop the columns that have all NaN values.
# Rebinding `df` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises a KeyError.
columns_to_drop = ['latitude', 'longitude', 'elevation', 'status']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [6]:
# Assert the columns were dropped (fails loudly if any of them survived),
# then preview the remaining columns
assert not set(columns_to_drop) & set(df.columns), "all-NaN columns are still present"
df.head()

Unnamed: 0,created_at,entry_id,field1,field2,field3,field4,field5,field6,field7
0,2024-04-06 12:00:27+05:30,1256,163,42.0,33.8,49.0,520.0,18.0,21.0
1,2024-04-06 12:01:30+05:30,1257,162,37.0,33.8,49.0,637.0,36.0,9.0
2,2024-04-06 12:02:32+05:30,1258,173,47.0,33.7,50.0,679.0,42.0,3.0
3,2024-04-06 12:03:34+05:30,1259,168,37.0,33.3,51.0,539.0,21.0,0.0
4,2024-04-06 12:04:36+05:30,1260,168,37.0,33.3,52.0,697.0,45.0,0.0


In [7]:
# Checking the datatypes: list each column's dtype to spot columns that
# still need converting (e.g. timestamps stored as `object`)
df.dtypes

created_at     object
entry_id        int64
field1          int64
field2        float64
field3        float64
field4        float64
field5        float64
field6        float64
field7        float64
dtype: object

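Fixing the remaining datatypes: `created_at` is stored as `object` (plain strings), so it is converted to a proper datetime column below. The other columns are already numeric.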

In [8]:
# Convert the created_at column to a datetime column named Datetime.
# The guard makes the cell safe to re-run: once `created_at` has been
# replaced by `Datetime` there is nothing left to convert, whereas the
# unguarded version raised a KeyError on a second run.
if 'created_at' in df.columns:
    df['Datetime'] = pd.to_datetime(df['created_at'])
    df = df.drop(columns='created_at')

In [9]:
# Preview the frame after created_at was replaced by Datetime
df.head(n=5)

Unnamed: 0,entry_id,field1,field2,field3,field4,field5,field6,field7,Datetime
0,1256,163,42.0,33.8,49.0,520.0,18.0,21.0,2024-04-06 12:00:27+05:30
1,1257,162,37.0,33.8,49.0,637.0,36.0,9.0,2024-04-06 12:01:30+05:30
2,1258,173,47.0,33.7,50.0,679.0,42.0,3.0,2024-04-06 12:02:32+05:30
3,1259,168,37.0,33.3,51.0,539.0,21.0,0.0,2024-04-06 12:03:34+05:30
4,1260,168,37.0,33.3,52.0,697.0,45.0,0.0,2024-04-06 12:04:36+05:30


In [10]:
# Assert that the Datetime column really has a datetime dtype
# (timezone-aware datetimes also pass this check), then show all dtypes
assert pd.api.types.is_datetime64_any_dtype(df['Datetime']), "Datetime is not a datetime column"
df.dtypes

entry_id                        int64
field1                          int64
field2                        float64
field3                        float64
field4                        float64
field5                        float64
field6                        float64
field7                        float64
Datetime    datetime64[ns, UTC+05:30]
dtype: object

In [12]:
# Create a minute column holding the whole minutes elapsed since the first
# reading. `Datetime` is already a datetime column, so it is used directly
# instead of being re-parsed, and `.iloc[0]` picks the first row by position
# rather than relying on the index containing the label 0.
df['minute'] = (df['Datetime'] - df['Datetime'].iloc[0]).dt.total_seconds() // 60

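Checking the data: the new `minute` column should start at 0 for the first reading and increase to 1439 for the last one.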

In [13]:
# First five rows: minute should start at 0
df.head(n=5)

Unnamed: 0,entry_id,field1,field2,field3,field4,field5,field6,field7,Datetime,minute
0,1256,163,42.0,33.8,49.0,520.0,18.0,21.0,2024-04-06 12:00:27+05:30,0.0
1,1257,162,37.0,33.8,49.0,637.0,36.0,9.0,2024-04-06 12:01:30+05:30,1.0
2,1258,173,47.0,33.7,50.0,679.0,42.0,3.0,2024-04-06 12:02:32+05:30,2.0
3,1259,168,37.0,33.3,51.0,539.0,21.0,0.0,2024-04-06 12:03:34+05:30,3.0
4,1260,168,37.0,33.3,52.0,697.0,45.0,0.0,2024-04-06 12:04:36+05:30,4.0


In [14]:
# Last five rows: minute should end just under 24 hours (1440 minutes)
df.tail(n=5)

Unnamed: 0,entry_id,field1,field2,field3,field4,field5,field6,field7,Datetime,minute
1328,2584,195,46.0,32.8,51.0,493.0,14.0,0.0,2024-04-07 11:54:43+05:30,1434.0
1329,2585,198,40.0,32.8,50.0,539.0,21.0,0.0,2024-04-07 11:55:45+05:30,1435.0
1330,2586,191,39.0,32.8,50.0,464.0,9.0,432.0,2024-04-07 11:56:47+05:30,1436.0
1331,2587,191,43.0,32.8,49.0,454.0,8.0,0.0,2024-04-07 11:58:44+05:30,1438.0
1332,2588,191,39.0,32.8,49.0,493.0,14.0,0.0,2024-04-07 11:59:46+05:30,1439.0
