In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [5]:
df = pd.read_csv("IMDb Movies India.csv", encoding='latin-1')

In [6]:
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [8]:
df.shape

(15509, 10)

In [9]:
df.dtypes

Name         object
Year         object
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

In [10]:
df.describe()

Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [11]:
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [13]:
df.duplicated().sum()

6

In [14]:
df.dropna(inplace=True)

In [15]:
df.shape

(5659, 10)

In [16]:
df.isna().sum()

Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64

In [17]:
df.columns

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')

In [18]:
df['Year'] = df['Year'].str.replace(r'[()]','',regex=True).astype(int)

In [19]:
df['Duration'] = pd.to_numeric(df['Duration'].str.replace(' min',''))

In [20]:
df['Genre'] = df['Genre'].str.split(',')
df = df.explode('Genre')
df['Genre'].fillna(df['Genre'].mode()[0],inplace=True)

In [21]:
df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',',''))

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11979 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      11979 non-null  object 
 1   Year      11979 non-null  int64  
 2   Duration  11979 non-null  int64  
 3   Genre     11979 non-null  object 
 4   Rating    11979 non-null  float64
 5   Votes     11979 non-null  int64  
 6   Director  11979 non-null  object 
 7   Actor 1   11979 non-null  object 
 8   Actor 2   11979 non-null  object 
 9   Actor 3   11979 non-null  object 
dtypes: float64(1), int64(3), object(6)
memory usage: 1.0+ MB


In [23]:
import plotly.express as px

year = px.histogram(df, x='Year', histnorm='probability density', nbins=30)
year.show()

In [24]:
avg_rating_by_year = df.groupby(['Year','Genre'])['Rating'].mean().reset_index()

top_genres = df['Genre'].value_counts().head(10).index

average_rating_by_year = avg_rating_by_year[avg_rating_by_year['Genre'].isin(top_genres)]

fig = px.line(avg_rating_by_year, x='Year', y='Rating', color='Genre')

fig.update_layout(title='Average Rating by Year and Genre',xaxis_title ='Year',yaxis_title = 'Average Rating')

fig.show()

In [26]:
rating_fig = px.histogram(df, x = 'Rating',histnorm = 'probability density',nbins =40)
rating_fig.update_layout(title = 'Distribution of Ratings',title_x = 0.5,title_pad = dict(t=20),title_font=dict(size=20),xaxis_title='Rating',yaxis_title ='Probability density')
rating_fig.show()

In [27]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,r2_score

In [28]:
df.drop('Name',axis=1,inplace=True)

In [30]:
genre_mean_rating = df.groupby('Genre')['Rating'].transform('mean')
df['Genre_Mean_Rating'] = genre_mean_rating

director_mean_rating = df.groupby('Director')['Rating'].transform('mean')
df['Director_encoded'] = director_mean_rating

actor1_mean_rating = df.groupby('Actor 1')['Rating'].transform('mean')
df['Actor1_encoded'] = actor1_mean_rating

actor2_mean_rating = df.groupby('Actor 2')['Rating'].transform('mean')
df['Actor2_encoded'] = actor2_mean_rating

actor3_mean_rating = df.groupby('Actor 3')['Rating'].transform('mean')
df['Actor3_encoded'] = actor3_mean_rating

In [31]:
x = df[['Year','Votes','Duration','Genre_Mean_Rating','Director_encoded','Actor1_encoded','Actor2_encoded','Actor3_encoded']]
y = df['Rating']

In [32]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size =0.2,random_state = 42)

In [34]:
ML = LinearRegression()
ML.fit(x_train,y_train)
ML_predict = ML.predict(x_test)

In [38]:
print('The Perfomance evaluvation of Logistic Regression is below: ', '\n')
print('Mean Squared Error:',mean_squared_error(y_test,ML_predict))
print('Mean absolute error :',mean_absolute_error(y_test,ML_predict))
print('R2 score : ',r2_score(y_test , ML_predict))

The Perfomance evaluvation of Logistic Regression is below:  

Mean Squared Error: 0.4463977880886115
Mean absolute error : 0.4921055068501125
R2 score :  0.7641906900948995


In [39]:
x.head()

Unnamed: 0,Year,Votes,Duration,Genre_Mean_Rating,Director_encoded,Actor1_encoded,Actor2_encoded,Actor3_encoded
1,2019,8,109,6.248697,7.0,6.85,7.0,7.0
3,2019,35,110,5.838423,4.4,5.25,4.4,4.46
3,2019,35,110,5.838739,4.4,5.25,4.4,4.46
5,1997,827,147,5.838423,5.335135,4.793617,5.73,5.93
5,1997,827,147,5.875793,5.335135,4.793617,5.73,5.93


In [40]:
y.head()

1    7.0
3    4.4
3    4.4
5    4.7
5    4.7
Name: Rating, dtype: float64

In [41]:
data = {'Year':[2019],'Votes':[36],'Duration':[111],'Genre_mean_rating':[5.8],'Director_encoded':[4.5],'Actor1_encoded':[5.3],'Actor2_encoded':[4.5],'Actor3_encoded':[4.5]}
trail = pd.DataFrame(data)

In [47]:
trail = trail.rename(columns={'Genre_mean_rating': 'Genre_Mean_Rating'})

rating_predicted = ML.predict(trail)
print('Predicted Rating :',rating_predicted[0])

Predicted Rating : 4.207758858584072
