# Preliminary Data Exploration
### Activities Dataset

##### 1. Verify the Structure

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Relative path to the raw activities export.
raw_activities_path = "../../data/raw/activities/activities.csv"
activities = pd.read_csv(raw_activities_path)

# Summarise column names, dtypes and non-null counts to verify the structure.
activities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   date            18 non-null     object
 1   activity (Y/N)  18 non-null     object
dtypes: object(2)
memory usage: 420.0+ bytes


In [3]:
# Preview the first five rows to sanity-check the loaded values.
activities.head(n=5)

Unnamed: 0,date,activity (Y/N)
0,2024-12-21,No
1,2024-12-22,No
2,2024-12-23,Yes
3,2024-12-24,No
4,2024-12-25,No


##### 2. Identify Missing Values

In [5]:
# Per-column count of missing entries (isna is the canonical name of isnull).
activities.isna().sum()

date              0
activity (Y/N)    0
dtype: int64

**Remove the dates after 2025-01-06 from the analysis**

##### 3. Correct Formats and Data Types

In [7]:
# Parse the date strings into datetime values so they compare chronologically.
activities['date'] = pd.to_datetime(activities['date'])

# Keep only rows dated on or before 2025-01-06; later dates are excluded.
date_limit = "2025-01-06"

# .copy() makes the result an independent frame rather than a slice of
# `activities`, so later modifications cannot raise SettingWithCopyWarning.
filtered_activities = activities[activities['date'] <= pd.to_datetime(date_limit)].copy()

print(filtered_activities)

         date activity (Y/N)
0  2024-12-21             No
1  2024-12-22             No
2  2024-12-23            Yes
3  2024-12-24             No
4  2024-12-25             No
5  2024-12-26            Yes
6  2024-12-27            Yes
7  2024-12-28             No
8  2024-12-29            Yes
9  2024-12-30            Yes
10 2024-12-31             No
11 2025-01-01             No
12 2025-01-02            Yes
13 2025-01-03            Yes
14 2025-01-04             No
15 2025-01-05            Yes
16 2025-01-06            Yes


In [11]:
# Check which distinct labels occur in the "activity (Y/N)" column.
activity_column = filtered_activities['activity (Y/N)']
unique_values = activity_column.unique()
print(unique_values)

['No' 'Yes']


##### 4. Identify Missing Dates

In [12]:
# Expected calendar window for the cleaned dataset (both ends included).
start_date, end_date = "2024-12-21", "2025-01-06"

# One timestamp per calendar day across the window.
full_date_range = pd.date_range(start_date, end_date, freq="D")
print(full_date_range)

DatetimeIndex(['2024-12-21', '2024-12-22', '2024-12-23', '2024-12-24',
               '2024-12-25', '2024-12-26', '2024-12-27', '2024-12-28',
               '2024-12-29', '2024-12-30', '2024-12-31', '2025-01-01',
               '2025-01-02', '2025-01-03', '2025-01-04', '2025-01-05',
               '2025-01-06'],
              dtype='datetime64[ns]', freq='D')


In [14]:
# Dates actually present in the filtered dataset.
filtered_dataset_dates = set(filtered_activities['date'])
# Expected days that never appear in the dataset.
missing_dates = set(full_date_range).difference(filtered_dataset_dates)

print(f"Missing dates: {missing_dates}")
print(f"How many missing dates? {len(missing_dates)}")

Missing dates: set()
How many missing dates? 0


##### 5. Check for Duplicate Data

In [17]:
# Count how many rows share each date; any count above one marks a duplicate.
date_counts = filtered_activities['date'].value_counts()
duplicate_data = date_counts.loc[date_counts.gt(1)]

print("Duplicate Data Detected:", len(duplicate_data), sep="\n")

Duplicate Data Detected:
0


##### Cleaning Process Completed: Save the Cleaned Data

In [16]:
# Destination folder for the cleaned dataset; create it if it does not exist yet.
output_dir = "../../data/cleaned/activities"
os.makedirs(output_dir, exist_ok=True)

# Write the filtered rows without the positional index column.
output_file = os.path.join(output_dir, "activities.csv")
filtered_activities.to_csv(output_file, index=False)

print(f"File saved in: {output_file}")

File saved in: ../../data/cleaned/activities\activities.csv
