In [38]:
from pathlib import Path
import pandas as pd

In [39]:
# Root folder of the project (resolved to an absolute path)
base_dir = Path('/Users/tabassum221b/Downloads/NetflixProject').resolve()

# Sub-folders for the Netflix data export and for the notebooks
data_dir = base_dir.joinpath('netflix-report')
notebook_dir = base_dir.joinpath('notebook')

Viewing Activity Dataset Cleaning

In [40]:
# Load the viewing-activity export from the CONTENT_INTERACTION folder
viewing_activity = pd.read_csv(data_dir.joinpath('CONTENT_INTERACTION', 'ViewingActivity.csv'))

In [41]:
# List the column names of the frame as loaded
viewing_activity.columns

Index(['Profile Name', 'Start Time', 'Duration', 'Attributes', 'Title',
       'Supplemental Video Type', 'Device Type', 'Bookmark', 'Latest Bookmark',
       'Country'],
      dtype='object')

The profile 'Oishi' is shared by Oishi and Fairuj; rows from this profile that were recorded in Bangladesh are attributed to Fairuj.

In [42]:
# Work out which person a viewing row belongs to
def assign_user(row):
    """Return the user name for one viewing row.

    Rows of the 'Oishi' profile whose country is 'BD (Bangladesh)' are
    attributed to 'Fairuj'; every other row keeps its 'Profile Name'.

    Parameters
    ----------
    row : mapping
        Must provide 'Profile Name'; 'Country' is only read for 'Oishi' rows.
    """
    is_fairuj = row['Profile Name'] == 'Oishi' and row['Country'] == 'BD (Bangladesh)'
    return 'Fairuj' if is_fairuj else row['Profile Name']

# Build the 'User' column by evaluating assign_user on every row
viewing_activity['User'] = viewing_activity.apply(assign_user, axis='columns')

In [43]:
# Show the first two rows that were attributed to 'Fairuj'
viewing_activity.loc[viewing_activity['User'].eq('Fairuj')].head(2)

Unnamed: 0,Profile Name,Start Time,Duration,Attributes,Title,Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country,User
1789,Oishi,2024-12-16 17:45:37,00:21:06,,Dark: Season 3: Life and Death (Episode 5),,Chrome PC (Cadmium),00:21:12,00:21:12,BD (Bangladesh),Fairuj
1790,Oishi,2024-12-16 16:47:47,00:39:28,,Dark: Season 3: The Origin (Episode 4),,Chrome PC (Cadmium),00:59:45,00:59:45,BD (Bangladesh),Fairuj


Modifying Duration Column

In [44]:
# Peek at one randomly chosen 'Duration' value to see its raw format.
# No random_state is passed, so the row shown differs between runs.
viewing_activity['Duration'].sample(1)

223    00:00:16
Name: Duration, dtype: object

In [45]:
# Parse 'Duration' and express it as a number of seconds.
# The parsed values live in a local variable, so no helper column has to be
# added to the frame and dropped again afterwards.
duration_seconds = pd.to_timedelta(viewing_activity['Duration']).dt.total_seconds()

# Duration in minutes and in hours, rounded to two decimals
viewing_activity['Duration (min)'] = (duration_seconds / 60).round(2)
viewing_activity['Duration (hr)'] = (duration_seconds / 3600).round(2)


Different profiles are used in different countries, so the recorded start times are converted to each country's local time below.

In [46]:
# List the distinct country labels present in the data
viewing_activity['Country'].unique()

array(['IE (Ireland)', 'BD (Bangladesh)', 'US (United States)',
       'GB (United Kingdom)'], dtype=object)

In [47]:
# Fixed UTC offsets (in hours) per country. These ignore daylight saving time,
# so they are only used as a fallback for countries that have no entry in
# `country_to_timezone`.
country_to_utc_offset = {
    'US (United States)': -5,  # UTC-5
    'BD (Bangladesh)': 6,      # UTC+6
    'IE (Ireland)': 0,         # UTC+0
    'GB (United Kingdom)': 0   # UTC+0
}

# IANA time zone per country, so that daylight saving time is applied on the
# dates where it is in force (a fixed offset is one hour off in summer for
# Ireland, the United Kingdom and the United States).
# NOTE(review): the US spans several time zones; 'America/New_York' is chosen
# because it matches the original UTC-5 standard offset - confirm.
country_to_timezone = {
    'US (United States)': 'America/New_York',
    'BD (Bangladesh)': 'Asia/Dhaka',
    'IE (Ireland)': 'Europe/Dublin',
    'GB (United Kingdom)': 'Europe/London',
}


def manual_adjust_to_local_time(row):
    """Convert a row's UTC 'Start Time' to the local wall-clock time of its country.

    Parameters
    ----------
    row : mapping
        Must provide 'Start Time' (a timestamp, treated as UTC when it carries
        no time zone) and 'Country'.

    Returns
    -------
    pandas.Timestamp
        Time-zone-naive local time. Missing timestamps are returned unchanged.
        Countries without a known time zone fall back to the fixed offset in
        `country_to_utc_offset` (0 hours when the country is unknown there too).
    """
    utc_time = row['Start Time']
    country = row['Country']

    # Nothing to convert for missing timestamps
    if pd.isna(utc_time):
        return utc_time

    tz_name = country_to_timezone.get(country)
    if tz_name is None:
        # Unknown time zone: keep the original fixed-offset behaviour
        utc_offset = country_to_utc_offset.get(country, 0)
        return utc_time + pd.Timedelta(hours=utc_offset)

    timestamp = pd.Timestamp(utc_time)
    if timestamp.tzinfo is None:
        # Naive values are interpreted as UTC
        timestamp = timestamp.tz_localize('UTC')

    # Convert to the country's zone, then drop the zone to get naive local time
    return timestamp.tz_convert(tz_name).tz_localize(None)

In [48]:
# Keep the raw start times in their own column. The backup is taken only once,
# so re-running this cell neither overwrites it with already-shifted values
# nor shifts 'Start Time' a second time.
if 'Start Time UTC' not in viewing_activity.columns:
    viewing_activity['Start Time UTC'] = viewing_activity['Start Time']

# Always start again from the preserved values, parsed to datetime
viewing_activity['Start Time'] = pd.to_datetime(viewing_activity['Start Time UTC'])

# Shift every row to the local time of its country
viewing_activity['Start Time'] = viewing_activity.apply(manual_adjust_to_local_time, axis=1)

In [49]:
# Show the first two rows so the adjusted 'Start Time' can be compared with 'Start Time UTC'
viewing_activity.head(2)

Unnamed: 0,Profile Name,Start Time,Duration,Attributes,Title,Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country,User,Duration (min),Duration (hr),Start Time UTC
0,Choity,2024-12-19 04:51:24,00:17:04,,Happiness: Limited Series: Episode 12,,Apple iPhone 14 iPhone,01:11:18,01:11:18,IE (Ireland),Choity,17.07,0.28,2024-12-19 04:51:24
1,Choity,2024-12-19 04:24:49,00:24:34,,Happiness: Limited Series: Epsiode 11 (Episode...,,Apple iPhone 14 iPhone,01:00:34,01:00:34,IE (Ireland),Choity,24.57,0.41,2024-12-19 04:24:49


New Columns

In [50]:
# Calendar features derived from 'Start Time'
start_dt = viewing_activity['Start Time'].dt
viewing_activity['Year'] = start_dt.year
viewing_activity['Month'] = start_dt.month
viewing_activity['Date_of_month'] = start_dt.day
viewing_activity['day_of_week'] = start_dt.dayofweek  # numeric day of the week
viewing_activity['day_name'] = start_dt.day_name()    # day of the week as a name

In [51]:
# Split 'Title' on its first two colons into 'TV Show', 'Season', and 'Episode'
title_parts = viewing_activity['Title'].str.split(':', n=2, expand=True)
for position, column in enumerate(['TV Show', 'Season', 'Episode']):
    if position in title_parts.columns:
        # Strip the pieces: splitting "Show: Season 1: Episode" on ':' would
        # otherwise leave the blank that follows each colon in front of the
        # season and episode values.
        viewing_activity[column] = title_parts[position].str.strip()
    else:
        # No title had this many pieces, so the split produced fewer than
        # three columns; fill the column with missing values instead of
        # failing on a column-count mismatch.
        viewing_activity[column] = None

# Determine Content Type (Movie or TV Show): a title without any colon has no
# 'Season' piece and is labelled 'Movie', everything else 'TV Show'.
# NOTE(review): a movie whose title contains a colon is labelled 'TV Show' by
# this rule - confirm whether that matters for the analysis.
viewing_activity['Content Type'] = viewing_activity['Season'].isna().map({True: 'Movie', False: 'TV Show'})

In [52]:
# Check the split on two randomly chosen rows.
# No random_state is passed, so the rows shown differ between runs.
viewing_activity[['Title', 'TV Show', 'Season', 'Episode', 'Content Type']].sample(2)

Unnamed: 0,Title,TV Show,Season,Episode,Content Type
657,Where Stars Land: Season 1_hook_03_16x9,Where Stars Land,Season 1_hook_03_16x9,,TV Show
1833,Gilmore Girls: Season 1 - CLM 2,Gilmore Girls,Season 1 - CLM 2,,TV Show


In [53]:
# List the distinct values of 'Supplemental Video Type' (including missing)
viewing_activity['Supplemental Video Type'].unique()

array([nan, 'HOOK', 'TRAILER', 'RECAP', 'TEASER_TRAILER', 'CINEMAGRAPH'],
      dtype=object)

In [54]:
# Discard every row that carries a 'Supplemental Video Type' value,
# keeping only the rows where that column is missing
not_supplemental = viewing_activity['Supplemental Video Type'].isna()
viewing_activity = viewing_activity.loc[not_supplemental]

In [55]:
# Remove the columns that are not carried into the cleaned dataset
columns_to_drop = ['Attributes', 'Supplemental Video Type', 'Bookmark', 'Latest Bookmark']
viewing_activity = viewing_activity.drop(columns=columns_to_drop)

Modifying Start Time column

In [56]:
# Parse 'Start Time' once; values that cannot be parsed become NaT
start_parsed = pd.to_datetime(viewing_activity['Start Time'], errors='coerce')

# 'Date' receives the calendar-date part
viewing_activity['Date'] = start_parsed.dt.date

# 'Start Time' is overwritten with the time-of-day part only
viewing_activity['Start Time'] = start_parsed.dt.time

In [57]:
# Final look at the first rows of the cleaned frame before saving
viewing_activity.head()

Unnamed: 0,Profile Name,Start Time,Duration,Title,Device Type,Country,User,Duration (min),Duration (hr),Start Time UTC,Year,Month,Date_of_month,day_of_week,day_name,TV Show,Season,Episode,Content Type,Date
0,Choity,04:51:24,00:17:04,Happiness: Limited Series: Episode 12,Apple iPhone 14 iPhone,IE (Ireland),Choity,17.07,0.28,2024-12-19 04:51:24,2024,12,19,3,Thursday,Happiness,Limited Series,Episode 12,TV Show,2024-12-19
1,Choity,04:24:49,00:24:34,Happiness: Limited Series: Epsiode 11 (Episode...,Apple iPhone 14 iPhone,IE (Ireland),Choity,24.57,0.41,2024-12-19 04:24:49,2024,12,19,3,Thursday,Happiness,Limited Series,Epsiode 11 (Episode 11),TV Show,2024-12-19
2,Choity,04:12:48,00:10:02,Happiness: Limited Series: Episode 10,Apple iPhone 14 iPhone,IE (Ireland),Choity,10.03,0.17,2024-12-19 04:12:48,2024,12,19,3,Thursday,Happiness,Limited Series,Episode 10,TV Show,2024-12-19
3,Choity,03:54:23,00:16:11,Happiness: Limited Series: Episode 9,Apple iPhone 14 iPhone,IE (Ireland),Choity,16.18,0.27,2024-12-19 03:54:23,2024,12,19,3,Thursday,Happiness,Limited Series,Episode 9,TV Show,2024-12-19
4,Choity,03:27:39,00:24:47,Happiness: Limited Series: Episode 8,Apple iPhone 14 iPhone,IE (Ireland),Choity,24.78,0.41,2024-12-19 03:27:39,2024,12,19,3,Thursday,Happiness,Limited Series,Episode 8,TV Show,2024-12-19


In [58]:
# Destination file, placed in the CONTENT_INTERACTION folder of the data directory
output_path = data_dir.joinpath('CONTENT_INTERACTION', 'ViewingActivity_Updated.csv')

# Write the cleaned frame as CSV, leaving out the row index
viewing_activity.to_csv(output_path, index=False)

print(f"Updated dataset has been saved to: {output_path}")

Updated dataset has been saved to: /Users/tabassum221b/Downloads/NetflixProject/netflix-report/CONTENT_INTERACTION/ViewingActivity_Updated.csv
