<a href="https://www.kaggle.com/code/faiqueali/in-progress-data-preprocessing?scriptVersionId=142899870" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

# Constants

In [None]:
# Path of the movies CSV under the Kaggle input directory; read by pd.read_csv below.
DATASET_PATH = '/kaggle/input/movies-dataset-for-feature-extracion-prediction/movies.csv'
# Pattern used with `str.match` on the YEAR column (so it is anchored at the start
# of the value) to label a row as a 'Movie'. It has three alternatives:
#   1. a parenthesised four-digit year, e.g. '(2021)';
#   2. a parenthesised run of roman-numeral letters immediately followed by a
#      parenthesised four-digit year, e.g. '(II)(2019)';
#   3. a parenthesised run of roman-numeral letters on its own, e.g. '(I)'.
# NOTE(review): every part of the non-capturing group is optional, so it appears
# to add nothing beyond `[MDCLXVI]+` -- confirm before simplifying.
REGEX_MOVIE_YEAR = r'\(\d{4}\)|\([MDCLXVI]+(?:M{0,3}D?C{0,3}L?X{0,3}V?I{0,3})\)\(\d{4}\)|\([MDCLXVI]+(?:M{0,3}D?C{0,3}L?X{0,3}V?I{0,3})\)'

# Helpers

In [None]:
def get_genre_count(dataframe):
    """Count how often each genre label occurs in a column of genre strings.

    Args:
        dataframe: Despite its name, a pandas Series of strings (it is
            accessed through ``.str``). Each value holds comma-separated
            genre labels, e.g. ``'Action, Drama'``.

    Returns:
        dict mapping each genre label to its number of occurrences.
        Missing values and empty labels are not counted.
    """
    # Split on the bare comma and strip afterwards, so 'Action,Drama',
    # 'Action, Drama' and ' Action ,  Drama ' all yield the same labels.
    genre_series = dataframe.str.split(',').explode().str.strip()

    # value_counts ignores NaN by default, so missing rows drop out here.
    genre_count = genre_series.value_counts().to_dict()

    # A stray or trailing comma leaves an empty label behind; it is not a genre.
    genre_count.pop('', None)

    return genre_count

# Gather Data

In [None]:
# Read the CSV located at DATASET_PATH into a DataFrame
data = pd.read_csv(filepath_or_buffer=DATASET_PATH)

# Preview the first ten rows
data.head(n=10)

In [None]:
# Display the (rows, columns) dimensions of the freshly loaded frame
data.shape

#### Check for null values in dataset

In [None]:
# Print the per-column dtype and non-null count summary
data.info()

In [None]:
# Tabulate the number of missing entries per column
data.isnull().sum().to_frame(name='Null Values Count')

#### Check for duplicates

In [None]:
# Count the rows flagged by DataFrame.duplicated() as repeats of an earlier row
data.duplicated().sum()

In [None]:
# Discard duplicated rows, keeping the first occurrence of each
data = data.drop_duplicates()
data.shape

In [None]:
# Re-tabulate the missing entries per column after de-duplication
data.isnull().sum().to_frame(name='Null Values Count')

#### Rename columns

In [None]:
# Bring the two mixed-case column names in line with the upper-case ones
data = data.rename(columns={'RunTime': 'RUNTIME', 'Gross': 'GROSS'})

# Cleaning Data

In [None]:
# Drop 'ONE-LINE' (the free-text description of the title) and 'GROSS'
# (dropped because of its significant number of missing values)
remove_columns = ['ONE-LINE', 'GROSS']
data = data.drop(columns=remove_columns)
data.head(n=5)

In [None]:
# Collect the rows in which YEAR, RATING, VOTES and RUNTIME are all NaN
# so they can be inspected before being filtered out of `data`
df_removed_na_attributes_movies = data[
    data[['YEAR', 'RATING', 'VOTES', 'RUNTIME']].isna().all(axis=1)
]
df_removed_na_attributes_movies.shape

In [None]:
# Preview the first rows of the all-NaN selection
df_removed_na_attributes_movies.head()

In [None]:
# Keep every row that has at least one of YEAR, RATING, VOTES or RUNTIME present
# (the complement of "all four are NaN").
# `.copy()` makes the result an independent frame, so the column assignments
# in the following cells do not act on a slice of the previous frame.
data = data[
    data[['YEAR', 'RATING', 'VOTES', 'RUNTIME']].notna().any(axis=1)
].copy()

data.shape

#### Fill Missing values and change column data types

In [None]:
# [FOR 'GENRE'] Remove '\n' and surrounding whitespace, then label missing genres.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `data['GENRE']` acts on a temporary selection and is not guaranteed to
# update `data` (it does not under pandas Copy-on-Write).
data['GENRE'] = (
    data['GENRE']
    .str.replace('\n', '')
    .str.strip()
    .fillna('Unknown')
)

In [None]:
# [FOR 'VOTES'] Strip thousands separators, treat missing as '0', convert to int.
# Done as one assigned-back chain: fillna(inplace=True) on `data['VOTES']`
# acts on a temporary selection and is not guaranteed to update `data`.
# ',' is a plain literal, so no regex matching is needed.
data['VOTES'] = (
    data['VOTES']
    .str.replace(',', '', regex=False)
    .fillna('0')
    .astype(int)
)

In [None]:
# [FOR 'RUNTIME'] fill missing values with 0.0
# Assigned back rather than fillna(inplace=True) on the column selection,
# which is not guaranteed to modify `data` itself.
data['RUNTIME'] = data['RUNTIME'].fillna(0.0)

In [None]:
# [FOR 'RATING'] fill missing values with 0.0
# Assigned back rather than fillna(inplace=True) on the column selection,
# which is not guaranteed to modify `data` itself.
data['RATING'] = data['RATING'].fillna(0.0)

In [None]:
# [FOR 'YEAR'] fill missing values with the 'Unknown' placeholder
# Assigned back rather than fillna(inplace=True) on the column selection,
# which is not guaranteed to modify `data` itself.
data['YEAR'] = data['YEAR'].fillna('Unknown')

In [None]:
# Tabulate the remaining missing entries per column after filling
data.isnull().sum().to_frame(name='Null Values Count')

## TODO:
* Fill missing values with the column mean for float/int attributes
* Group movies by name
* Split STARS into DIRECTOR and CAST column
* Find no. of years for series type
* Maintain directors count and cast count in a dict

# Data Transformation

#### Add conditional column to distinguish between Movie and Series

In [None]:
# Create a conditional column "Type": 'Movie' when YEAR matches
# REGEX_MOVIE_YEAR from its first character, 'Series' otherwise.
# The match is done once on the whole column instead of building a
# one-element Series per row; `na=False` sends missing or non-string
# values to 'Series'.
data['Type'] = np.where(
    data['YEAR'].str.match(REGEX_MOVIE_YEAR, na=False),
    'Movie',
    'Series',
)
data.head(5)

In [None]:
# Count genre occurrences in the GENRE column with the get_genre_count helper
genre_count = get_genre_count(data['GENRE'])
# Show the resulting genre -> count mapping as the cell output
genre_count

In [None]:
# Check unique values: number of distinct entries in each column
data.nunique()

### Check insights, correlation and covariance

In [None]:
# Numeric subset used for the summary statistics and the correlation work
df_insights = data.loc[:, ['RATING', 'VOTES', 'RUNTIME']]
df_insights.describe()

In [None]:
# Pairwise correlation between RATING, VOTES and RUNTIME
df_insights.corr()

In [None]:
# Pairwise covariance between RATING, VOTES and RUNTIME
df_insights.cov()

# Data Visualization

In [None]:
# [HEAT MAP] Plot the correlation between RATING, VOTES and RUNTIME
# Set the plot size
plt.figure(figsize=(5, 5))
sns.heatmap(df_insights.corr(), annot=True, cbar=True, annot_kws={'size': 12}, cmap="Blues")
# plt.show must be called; the bare name `plt.show` only references the function
plt.show()

In [None]:
# Pairwise scatter/distribution grid of the frame's columns, coloured by the 'Type' label
sns.pairplot(data=data, hue='Type')