In [None]:
!pip install pandas
!pip install numpy 
!pip install matplotlib
!pip install seaborn



In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the movie dataset from the CSV next to the notebook; the file's first
# column is used as the DataFrame index instead of a data column.
df = pd.read_csv(
    "Top_1000_IMDb_movies_New_version.csv",
    index_col=0,
)

## Fixing column types and dropping NaN values

In [None]:
# 'Votes' is handled as text containing comma separators (presumably values
# such as "1,234" -- confirm against the CSV). Strip the commas and store the
# result as integers.
# Casting to str first keeps the cell safe to re-run: once the column is
# already numeric, the .str accessor on it would otherwise raise.
df['Votes'] = pd.to_numeric(
    df['Votes'].astype(str).str.replace(',', '', regex=False)
).astype(int)


In [None]:
# Columns that should hold numbers. Each one is parsed with pd.to_numeric;
# errors='coerce' turns any value that cannot be parsed into NaN instead of
# raising.
convert = ['Year of Release', 'Gross']
for column in convert:
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [None]:
# Discard every row that has a missing (NaN/null) value in any column, then
# print a summary of what is left.
df = df.dropna(axis=0, how='any')
df.info()

## Creating a correlogram of the different variables in the dataset

In [None]:
# Pairwise correlation between the numeric columns, drawn as an annotated
# heatmap.
# Non-numeric columns are filtered out explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises on text columns.
correlation_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

In [None]:
# Average Metascore per release year, drawn as a line, with a linear trend
# fitted to the individual movies and an arrow marking the lowest-scoring year.
x = df['Year of Release']
y = df['Metascore of movie']

# One row per release year holding the mean Metascore of that year's movies.
df_avg_metascore = df.groupby('Year of Release')['Metascore of movie'].mean().reset_index()

# Row of the yearly table with the lowest average Metascore.
min_metascore_year = df_avg_metascore.loc[df_avg_metascore['Metascore of movie'].idxmin()]

fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(df_avg_metascore['Year of Release'], df_avg_metascore['Metascore of movie'])
# scatter=False: draw only the fitted regression line, not the raw points.
sns.regplot(x=x, y=y, color='red', scatter=False, ax=ax)

ax.annotate(
    f"Worse Performing Year {int(min_metascore_year['Year of Release'])}\n"
    f"Avg Metascore: {min_metascore_year['Metascore of movie']:.2f}",
    xy=(min_metascore_year['Year of Release'], min_metascore_year['Metascore of movie']),
    # Place the label 5 years to the right of the point the arrow targets.
    xytext=(min_metascore_year['Year of Release'] + 5, min_metascore_year['Metascore of movie']),
    arrowprops=dict(facecolor='black', shrink=0.05),
)

ax.set_title('Time Series of Year of Release vs Average Metascore')
ax.set_xlabel('Year of Release')
ax.set_ylabel('Average Metascore')

# Save BEFORE showing: once plt.show() has displayed the figure it is no longer
# the current figure, so a later plt.savefig() would write an empty canvas.
fig.savefig('time_series.svg')
plt.show()

In [None]:
# Columns whose distributions are plotted, one histogram panel per column.
numerical_columns = ['Year of Release', 'Watch Time', 'Movie Rating', 'Metascore of movie', 'Gross', 'Votes']

# A single row of panels. squeeze=False keeps `axes` two-dimensional even when
# the list above has only one entry, so axes[0] is always the row of panels.
fig, axes = plt.subplots(1, len(numerical_columns), figsize=(20, 6), squeeze=False)

for ax, col in zip(axes[0], numerical_columns):
    # Histogram with a kernel density estimate overlaid.
    sns.histplot(df[col], kde=True, color='green', ax=ax)
    # NOTE(review): linestyle='none' presumably leaves the grid invisible --
    # confirm whether a visible grid was intended here.
    ax.grid(True, linestyle='none', alpha=0.6)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# y=1.05 places the title slightly above the top edge of the figure.
fig.suptitle('Numerical Column Distributions', y=1.05)
fig.tight_layout()

# Save BEFORE showing: once plt.show() has displayed the figure it is no longer
# the current figure, so a later plt.savefig() would write an empty canvas.
# bbox_inches='tight' grows the saved area so the title above the canvas is
# not cropped out of the file.
fig.savefig('numerical_distributions.svg', bbox_inches='tight')
plt.show()