## Preprocessing 

In [46]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import re
from collections import Counter
import pycountry_convert as pc
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
import numpy as np

In [51]:
df = pd.read_csv('preprocessed_imdb.csv')

### Countries Feature

For dealing with countries feature we one hot them by their continent.  

In [52]:
def country_to_continent(country_name):
    if country_name == 'United States':
        country_name = "United States of America"

    if country_name == 'West Germany':
        country_name = "Germany"

    if country_name == 'Netherlands Antilles':
        country_name = "Venezuela"
    
    if country_name == 'Soviet Union':
        country_name = "Russia"
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
    
    except LookupError:
        print(f"Country: {country_name} Not found")
        return "Not found"

In [53]:
df['Country_list'] = df['Country of Origin'].str.split('~')
df['Continents'] = df['Country_list'].apply(lambda countries: [country_to_continent(country) for country in countries])


all_continents = set(continent for sublist in df['Continents'] for continent in sublist)
for continent in all_continents:
    df[continent] = df['Continents'].apply(lambda x: int(continent in x))


df.drop(df[df['Not found'] == 1].index, inplace=True)
df = df.drop(columns=['Continents', 'Not found'])


df.describe()

Country: Serbia and Montenegro Not found
Country: Federal Republic of Yugoslavia Not found
Country: Yugoslavia Not found


Unnamed: 0,Release Year,Duration,Rating,Number of Votes,Meta Score,Budget,Gross (worldwide),profit,Europe,Asia,Africa,South America,Oceania,North America
count,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0
mean,2003.642797,111.307415,6.660763,173165.0,57.971822,41.279362,128.291388,87.012026,0.342797,0.122669,0.008686,0.00572,0.047034,0.913983
std,13.421396,20.282864,0.930735,232034.1,17.926348,47.459251,209.48558,178.956505,0.474694,0.328092,0.092805,0.075424,0.211734,0.280419
min,1950.0,63.0,1.5,26000.0,1.0,0.00156,0.000126,-199.821857,0.0,0.0,0.0,0.0,0.0,0.0
25%,1997.0,97.0,6.1,50000.0,45.0,10.0,19.059963,0.883066,0.0,0.0,0.0,0.0,0.0,1.0
50%,2006.0,108.0,6.7,95000.0,58.0,25.0,55.418696,28.148098,0.0,0.0,0.0,0.0,0.0,1.0
75%,2013.0,121.0,7.3,196000.0,71.0,54.0,147.969526,97.493462,1.0,0.0,0.0,0.0,0.0,1.0
max,2024.0,321.0,9.3,2900000.0,100.0,356.0,2923.706026,2686.706026,1.0,1.0,1.0,1.0,1.0,1.0


### Genere one hot encoding

In [54]:
df['Genre_list'] = df['Genre'].str.split('~')

all_genres = set(genre for sublist in df['Genre_list'] for genre in sublist)
print("total number of generes: ", len(all_genres))

for genre in all_genres:
    df[genre] = df['Genre_list'].apply(lambda x: int(genre in x))

df = df.drop(columns=['Genre_list'])

total number of generes:  22


### companies

In [55]:
all_companies = df['Companies'].str.split('~').explode()
company_counts = Counter(all_companies)
top_companies = company_counts.most_common(25)
top_df = pd.DataFrame(top_companies, columns=['Company', 'Frequency'])
print(top_df)
print(top_df['Frequency'].sum() / df.shape[0])

                      Company  Frequency
0                Warner Bros.        372
1          Universal Pictures        362
2           Columbia Pictures        315
3          Paramount Pictures        309
4       Twentieth Century Fox        220
5        Walt Disney Pictures        176
6             New Line Cinema        171
7   Metro-Goldwyn-Mayer (MGM)        112
8            Relativity Media        110
9         Dreamworks Pictures        107
10    New Regency Productions        105
11        Touchstone Pictures        105
12            Lionsgate Films         87
13  Village Roadshow Pictures         84
14                    Miramax         72
15       Summit Entertainment         70
16       Amblin Entertainment         59
17                StudioCanal         56
18                Screen Gems         55
19          Fox 2000 Pictures         54
20             Focus Features         54
21    Legendary Entertainment         53
22           TriStar Pictures         53
23      Imagine 

In [56]:
df.describe()

Unnamed: 0,Release Year,Duration,Rating,Number of Votes,Meta Score,Budget,Gross (worldwide),profit,Europe,Asia,...,Horror,Music,Adventure,Crime,History,Fantasy,Animation,Sci-Fi,Western,Sport
count,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,...,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0,4720.0
mean,2003.642797,111.307415,6.660763,173165.0,57.971822,41.279362,128.291388,87.012026,0.342797,0.122669,...,0.127119,0.026271,0.236653,0.200847,0.028602,0.091737,0.057415,0.097034,0.007203,0.02161
std,13.421396,20.282864,0.930735,232034.1,17.926348,47.459251,209.48558,178.956505,0.474694,0.328092,...,0.333141,0.159958,0.425072,0.400677,0.166702,0.288685,0.232659,0.296035,0.084576,0.145422
min,1950.0,63.0,1.5,26000.0,1.0,0.00156,0.000126,-199.821857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1997.0,97.0,6.1,50000.0,45.0,10.0,19.059963,0.883066,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2006.0,108.0,6.7,95000.0,58.0,25.0,55.418696,28.148098,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2013.0,121.0,7.3,196000.0,71.0,54.0,147.969526,97.493462,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2024.0,321.0,9.3,2900000.0,100.0,356.0,2923.706026,2686.706026,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Engineering

### Feature Scaling

In [58]:
features_to_scale = ['Duration', 'Number of Votes', 'Meta Score', 'Budget', 'Gross (worldwide)']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features_to_scale])
scaled_df = pd.DataFrame(scaled_features, columns=features_to_scale)
df[features_to_scale] = scaled_df
df = df.dropna()
df.to_csv('scaled_data.csv', index=False)

## Dimensionality Reduction

In [59]:
# Separate features and target variable
features = df.drop(columns=['profit', 'Rating'])
targets = df[['profit', 'Rating']]

### Use the PCA method to reduce the dimensions of numerical features to two dimensions

In [60]:
numerical_features = features.select_dtypes(include=[np.number])
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(numerical_features)



### How much of the initial data variance is transferred to the new space?

In [61]:
reduced_df = pd.DataFrame(reduced_data, columns=['PC1', 'PC2'])
variance_ratio = pca.explained_variance_ratio_
transferred_variance = variance_ratio.sum()
print(f"Transferred variance to 2D space: {transferred_variance * 100:.2f}%")

Transferred variance to 2D space: 97.33%


### If we aim to retain 95% of the original variance, what is the minimum number of dimensions required in the new space?

In [62]:
pca_95 = PCA(0.95)
pca_95.fit(numerical_features)
n_components_95 = pca_95.n_components_
print(f"Number of components to retain 95% variance: {n_components_95}")

#Save both the original data and the dimension-reduced one for the next parts
df.to_csv('original_data.csv', index=False)
reduced_df_with_targets = pd.concat([reduced_df, targets.reset_index(drop=True)], axis=1)
reduced_df_with_targets.to_csv('reduced_data.csv', index=False)

Number of components to retain 95% variance: 1
