# Setting Up the Environment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Data from the Dataset

In [2]:
# Read the movies CSV into a DataFrame (relative path, resolved against the working directory)
data = pd.read_csv(filepath_or_buffer="Movies_data.csv")

In [3]:
# Preview the loaded DataFrame; as the last expression of the cell it is rendered as a table
data

Unnamed: 0.1,Unnamed: 0,id,title,original_language,release_date,vote_average,popularity,adult
0,0,238,The Godfather,en,1972-03-14,8.7,114.574,False
1,1,278,The Shawshank Redemption,en,1994-09-23,8.7,91.998,False
2,2,240,The Godfather Part II,en,1974-12-20,8.6,61.490,False
3,3,19404,Dilwale Dulhania Le Jayenge,hi,1995-10-20,8.6,28.989,False
4,4,424,Schindler's List,en,1993-12-15,8.6,43.764,False
...,...,...,...,...,...,...,...,...
19995,19995,384521,The Cloverfield Paradox,en,2018-02-04,5.6,17.073,False
19996,19996,14361,Captain Ron,en,1992-09-18,5.6,11.060,False
19997,19997,11637,Are We There Yet?,en,2005-01-20,5.6,15.591,False
19998,19998,10956,Joe Dirt,en,2001-04-10,5.6,15.664,False


In [4]:
# Summarize each column's non-null count and dtype to check whether any values are missing
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         20000 non-null  int64  
 1   id                 20000 non-null  int64  
 2   title              20000 non-null  object 
 3   original_language  20000 non-null  object 
 4   release_date       20000 non-null  object 
 5   vote_average       20000 non-null  float64
 6   popularity         20000 non-null  float64
 7   adult              20000 non-null  bool   
dtypes: bool(1), float64(2), int64(2), object(3)
memory usage: 1.1+ MB


# Separate the Data into X and Y
## X = independent variables (features), Y = dependent variable (target)

In [5]:
from sklearn.model_selection import train_test_split

# Target (dependent variable): the vote_average column on its own
Y = data['vote_average']
# Features (independent variables): every column except vote_average
X = data.drop(columns=['vote_average'])

In [6]:
# Inspect the feature table (all columns except vote_average)
X

Unnamed: 0.1,Unnamed: 0,id,title,original_language,release_date,popularity,adult
0,0,238,The Godfather,en,1972-03-14,114.574,False
1,1,278,The Shawshank Redemption,en,1994-09-23,91.998,False
2,2,240,The Godfather Part II,en,1974-12-20,61.490,False
3,3,19404,Dilwale Dulhania Le Jayenge,hi,1995-10-20,28.989,False
4,4,424,Schindler's List,en,1993-12-15,43.764,False
...,...,...,...,...,...,...,...
19995,19995,384521,The Cloverfield Paradox,en,2018-02-04,17.073,False
19996,19996,14361,Captain Ron,en,1992-09-18,11.060,False
19997,19997,11637,Are We There Yet?,en,2005-01-20,15.591,False
19998,19998,10956,Joe Dirt,en,2001-04-10,15.664,False


In [7]:
# Inspect the target Series (vote_average)
Y

0        8.7
1        8.7
2        8.6
3        8.6
4        8.6
        ... 
19995    5.6
19996    5.6
19997    5.6
19998    5.6
19999    5.6
Name: vote_average, Length: 20000, dtype: float64

# Split into Training and Test Datasets

In [8]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so the split (and every output derived from it) is the same
# after each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [9]:
# Re-attach the target to the training features (matched on the row index)
# so both can be explored together as a single table
train_data = X_train.join(other=y_train, how="left")

In [10]:
# Inspect the combined training table (features plus the vote_average target)
train_data

Unnamed: 0.1,Unnamed: 0,id,title,original_language,release_date,popularity,adult,vote_average
17604,17604,65650,The Good Doctor,en,2011-04-23,20.434,False,6.2
7562,7562,38167,Eat Pray Love,en,2010-08-12,17.710,False,6.2
19099,19099,34494,Sorry if I Love You,it,2008-01-25,9.383,False,5.9
8137,8137,21349,Anywhere but Here,en,1999-11-12,14.514,False,6.1
2421,2421,985939,Fall,en,2022-08-11,196.537,False,7.3
...,...,...,...,...,...,...,...,...
8100,8100,18360,Night at the Museum: Battle of the Smithsonian,en,2009-05-20,31.663,False,6.1
12081,12081,396263,Gantz:O,ja,2016-10-14,1098.178,False,7.3
704,704,169813,Short Term 12,en,2013-08-23,12.599,False,7.8
9573,9573,46261,Don't Be Afraid of the Dark,en,2010-11-06,20.281,False,5.8


In [11]:
# Count how many training movies there are for each original language
train_data['original_language'].value_counts()

original_language
en    12131
fr     1126
it      674
ja      512
es      359
de      180
ko      161
cn      113
zh      112
hi       88
ru       88
pt       70
sv       65
da       56
no       46
pl       36
nl       26
tr       19
th       19
hu       14
fa       13
fi       12
id        9
cs        8
el        7
te        6
is        6
ar        6
uk        5
ro        4
eu        4
sr        3
xx        3
hy        2
tn        2
km        2
et        2
bn        2
nb        2
he        2
ml        1
gl        1
bs        1
sh        1
la        1
Name: count, dtype: int64

### Since the languages in the training data come from a random sample, I cannot simply use dummy variables to represent them
### Also, the release date cannot be read by the model, so transform it into values that the model can read

In [13]:
# Parse the release-date strings once, then reuse the parsed Series below
release_dates = pd.to_datetime(train_data['release_date'])
train_data['release_date'] = release_dates
# Expose the calendar components as separate numeric columns
train_data['Year'] = release_dates.dt.year
train_data['Month'] = release_dates.dt.month
train_data['Day'] = release_dates.dt.day

In [14]:
# Re-inspect the training table to confirm the new Year, Month and Day columns
train_data

Unnamed: 0.1,Unnamed: 0,id,title,original_language,release_date,popularity,adult,vote_average,Year,Month,Day
17604,17604,65650,The Good Doctor,en,2011-04-23,20.434,False,6.2,2011,4,23
7562,7562,38167,Eat Pray Love,en,2010-08-12,17.710,False,6.2,2010,8,12
19099,19099,34494,Sorry if I Love You,it,2008-01-25,9.383,False,5.9,2008,1,25
8137,8137,21349,Anywhere but Here,en,1999-11-12,14.514,False,6.1,1999,11,12
2421,2421,985939,Fall,en,2022-08-11,196.537,False,7.3,2022,8,11
...,...,...,...,...,...,...,...,...,...,...,...
8100,8100,18360,Night at the Museum: Battle of the Smithsonian,en,2009-05-20,31.663,False,6.1,2009,5,20
12081,12081,396263,Gantz:O,ja,2016-10-14,1098.178,False,7.3,2016,10,14
704,704,169813,Short Term 12,en,2013-08-23,12.599,False,7.8,2013,8,23
9573,9573,46261,Don't Be Afraid of the Dark,en,2010-11-06,20.281,False,5.8,2010,11,6
