In [0]:
import pandas as pd
df=pd.read_csv("/Volumes/workspace/default/netflixdata/netflix_titles_cleaned.csv")
# Drop columns
df = df.drop(columns=["rating 66min", "rating 74 min", "rating 84min"], errors="ignore")

# Confirm
print(df.columns)


Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [0]:
for col in df.columns:
    print(col)


show_id
type
title
director
cast
country
date_added
release_year
rating
duration
listed_in
description


In [0]:
# Country frequency encoding
df['country_encoded'] = df['country'].map(df['country'].value_counts())

# Genres frequency encoding
df['Genres_encoded'] = df['listed_in'].map(df['listed_in'].value_counts())

print(df[['country', 'country_encoded']].head(10), "\n")

print(df[['listed_in', 'Genres_encoded']].head(10), "\n")



                                             country  country_encoded
0                                      United States           2818.0
1                                       South Africa             30.0
2                                                NaN              NaN
3                                                NaN              NaN
4                                              India            972.0
5                                                NaN              NaN
6                                                NaN              NaN
7  United States, Ghana, Burkina Faso, United Kin...              1.0
8                                     United Kingdom            419.0
9                                      United States           2818.0 

                                           listed_in  Genres_encoded
0                                      Documentaries             359
1    International TV Shows, TV Dramas, TV Mysteries              26
2  Crime TV Shows, In

In [0]:
# Release Year ordinal encoding
year_mapping = {year: idx for idx, year in enumerate(sorted(df['release_year'].dropna().unique()))}
df['release_year_encoded'] = df['release_year'].map(year_mapping)
print("Release Year Ordinal Encoding:")
print(df[['release_year', 'release_year_encoded']].head(10), "\n")

# Rating ordinal encoding (custom order)
rating_order = ['G','PG','PG-13','R','NC-17','TV-Y','TV-Y7','TV-G','TV-PG','TV-14','TV-MA','Unrated']
rating_mapping = {rating: idx for idx, rating in enumerate(rating_order)}
df['rating_encoded'] = df['rating'].replace(rating_mapping)
print("Rating Ordinal Encoding:")
print(df[['rating', 'rating_encoded']].head(10), "\n")


Release Year Ordinal Encoding:
   release_year  release_year_encoded
0          2020                    72
1          2021                    73
2          2021                    73
3          2021                    73
4          2021                    73
5          2021                    73
6          2021                    73
7          1993                    45
8          2021                    73
9          2021                    73 

Rating Ordinal Encoding:
  rating rating_encoded
0  PG-13              2
1  TV-MA             10
2  TV-MA             10
3  TV-MA             10
4  TV-MA             10
5  TV-MA             10
6     PG              1
7  TV-MA             10
8  TV-14              9
9  PG-13              2 



In [0]:
df = pd.get_dummies(df, columns=['type'], prefix='type')
print("type One-Hot Encoding:")
print(df[['type_Movie', 'type_TV Show']].head(10), "\n")


type One-Hot Encoding:
   type_Movie  type_TV Show
0           1             0
1           0             1
2           0             1
3           0             1
4           0             1
5           0             1
6           1             0
7           1             0
8           0             1
9           1             0 



In [0]:
# Extract number from Duration
df['duration_num'] = df['duration'].str.extract(r'(\d+)').astype(float)

# Optional: separate by unit
df['duration_unit'] = df['duration'].str.extract(r'([A-Za-z]+)')
print("duration Numeric Conversion:")
print(df[['duration', 'duration_num', 'duration_unit']].head(10), "\n")


duration Numeric Conversion:
    duration  duration_num duration_unit
0     90 min          90.0           min
1  2 Seasons           2.0       Seasons
2   1 Season           1.0        Season
3   1 Season           1.0        Season
4  2 Seasons           2.0       Seasons
5   1 Season           1.0        Season
6     91 min          91.0           min
7    125 min         125.0           min
8  9 Seasons           9.0       Seasons
9    104 min         104.0           min 



In [0]:
# Extract numeric duration first
df['duration_num'] = df['duration'].str.extract(r'(\d+)').astype(float)

# Min-Max normalization
df['duration_normalized'] = (df['duration_num'] - df['duration_num'].min()) / (df['duration_num'].max() - df['duration_num'].min())

# View results
print(df[['duration', 'duration_num', 'duration_normalized']].head(10))


    duration  duration_num  duration_normalized
0     90 min          90.0             0.286174
1  2 Seasons           2.0             0.003215
2   1 Season           1.0             0.000000
3   1 Season           1.0             0.000000
4  2 Seasons           2.0             0.003215
5   1 Season           1.0             0.000000
6     91 min          91.0             0.289389
7    125 min         125.0             0.398714
8  9 Seasons           9.0             0.025723
9    104 min         104.0             0.331190


In [0]:
print(df)

     show_id                  title  ... duration_unit duration_normalized
0         s1   Dick Johnson Is Dead  ...           min            0.286174
1         s2          Blood & Water  ...       Seasons            0.003215
2         s3              Ganglands  ...        Season            0.000000
3         s4  Jailbirds New Orleans  ...        Season            0.000000
4         s5           Kota Factory  ...       Seasons            0.003215
...      ...                    ...  ...           ...                 ...
8802   s8803                 Zodiac  ...           min            0.504823
8803   s8804            Zombie Dumb  ...       Seasons            0.003215
8804   s8805             Zombieland  ...           min            0.279743
8805   s8806                   Zoom  ...           min            0.279743
8806   s8807                 Zubaan  ...           min            0.353698

[8807 rows x 20 columns]
