In [12]:
# Import necessary packages
import pandas as pd,numpy as np, pickle
from sklearn import preprocessing

# Set the max display rows to 500
pd.set_option('display.max_rows', 500)

# Check pandas and numpy version
print('Pandas version:',pd.__version__)
print('Numpy version:',np.__version__)

Pandas version: 1.3.4
Numpy version: 1.21.2


In [2]:
movie_df = pd.read_csv('imdb - Copy.csv')
movie_df.head()

Unnamed: 0,Name,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
0,No Time to Die,2021,7.6,107163,"Action, Adventure, Thriller",163,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
1,The Guilty,2021,6.3,64375,"Crime, Drama, Thriller",90,Film,R,-,,,Severe,,Moderate
2,The Many Saints of Newark,2021,6.4,27145,"Crime, Drama",120,Film,R,-,Moderate,Severe,Severe,Moderate,Moderate
3,Venom: Let There Be Carnage,2021,6.4,30443,"Action, Adventure, Sci-Fi",97,Film,PG-13,-,,Moderate,Moderate,Mild,Moderate
4,Dune,2021,8.3,84636,"Action, Adventure, Drama",155,Film,PG-13,-,,Moderate,,Mild,Moderate


In [3]:
# To make sense with the data in the 'Date' column, 
# we change the name of that column to 'Year'.
movie_df = movie_df.rename(columns={'Date':'Year'})

# Check the data frame info
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6178 entries, 0 to 6177
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         6178 non-null   object
 1   Year         6178 non-null   int64 
 2   Rate         6178 non-null   object
 3   Votes        6178 non-null   object
 4   Genre        6178 non-null   object
 5   Duration     6178 non-null   object
 6   Type         6178 non-null   object
 7   Certificate  6178 non-null   object
 8   Episodes     6178 non-null   object
 9   Nudity       6178 non-null   object
 10  Violence     6178 non-null   object
 11  Profanity    6178 non-null   object
 12  Alcohol      6178 non-null   object
 13  Frightening  6178 non-null   object
dtypes: int64(1), object(13)
memory usage: 675.8+ KB


In [4]:
# Check every rows for duplicate values. Have 'True' if have duplicates
movie_df.duplicated().unique()

array([False,  True])

In [5]:
new_df = movie_df.drop_duplicates()
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5028 entries, 0 to 5027
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         5028 non-null   object
 1   Year         5028 non-null   int64 
 2   Rate         5028 non-null   object
 3   Votes        5028 non-null   object
 4   Genre        5028 non-null   object
 5   Duration     5028 non-null   object
 6   Type         5028 non-null   object
 7   Certificate  5028 non-null   object
 8   Episodes     5028 non-null   object
 9   Nudity       5028 non-null   object
 10  Violence     5028 non-null   object
 11  Profanity    5028 non-null   object
 12  Alcohol      5028 non-null   object
 13  Frightening  5028 non-null   object
dtypes: int64(1), object(13)
memory usage: 589.2+ KB


In [6]:
category_cols = ['Name','Genre',\
    'Type','Certificate','Nudity',\
    'Violence','Profanity','Alcohol','Frightening']
numerical_cols = ['Year','Rate','Votes',\
                'Duration','Episodes']

In [7]:
# Loop through category columns
# to see unique value in each columns.
for column in category_cols:
    uniqueValues = new_df[column].unique()
    print('Unique elements in column \'' + column + '\'')
    print(uniqueValues)
    print()

# Question need to ask: Should we ignore the column Genre

Unique elements in column 'Name'
['No Time to Die' 'The Guilty' 'The Many Saints of Newark' ...
 "Before the Devil Knows You're Dead" 'Queen Bees' 'Death Race']

Unique elements in column 'Genre'
['Action, Adventure, Thriller' 'Crime, Drama, Thriller' 'Crime, Drama'
 'Action, Adventure, Sci-Fi' 'Action, Adventure, Drama'
 'Comedy, Drama, Sport' 'Action, Adventure, Comedy'
 'Animation, Action, Adventure' 'Comedy, Drama' 'Drama, Horror, Thriller'
 'Crime, Drama, Mystery' 'Drama, Sci-Fi' 'Comedy, Crime, Drama' 'Comedy'
 'Drama, Romance' 'Action, Drama, History' 'Action, Fantasy, Mystery'
 'Action, Adventure, Fantasy' 'Drama' 'Action, Horror, Mystery'
 'Crime, Drama, Fantasy' 'Horror, Thriller' 'Horror, Mystery, Thriller'
 'Crime, Horror, Mystery' 'Drama, Mystery, Thriller'
 'Action, Crime, Drama' 'Biography, Drama, Musical'
 'Drama, Mystery, Sci-Fi' 'Action, Drama, Sci-Fi'
 'Animation, Adventure, Comedy' 'Comedy, Fantasy, Horror'
 'Drama, Horror, Sci-Fi' 'Biography, Crime, Drama' 'Crime'


In [8]:
# Loop through category columns
# to see unique value in each columns.
for column in numerical_cols:
    uniqueValues = new_df[column].unique()
    print('Unique elements in column \'' + column + '\'')
    print(uniqueValues)
    print()

Unique elements in column 'Year'
[2021 2015 2006 2020 2018 2012 1993 1984 2008 1978 2019 1997 2022 1995
 1996 2017 1999 2013 1994 2001 2007 1991 2016 2000 1972 1988 2014 1975
 2005 2009 1981 2002 2010 2004 1980 1962 1982 1992 1998 1969 1990 1985
 2003 1973 1971 1974 1987 2011 1979 1964 1986 1963 1989 1976 1983 1960
 1977 1957 1966 1968 1965 1953 1967 1950 1939 1948 1942 1961 1941 1954
 2023 1931 1958 1946 1959 1956 1945 1935 1932 1927 1951 1922 1937 1952
 1970 1940 1955 1949 1933 1943 1944]

Unique elements in column 'Rate'
['7.6' '6.3' '6.4' '8.3' '8.8' '7.3' 'No Rate' '8.2' '7.9' '7.4' '9.2'
 '8.0' '7.5' '6.8' '7.7' '8.4' '8.1' '6.1' '4.8' '4.1' '6.5' '8.7' '5.2'
 '6.7' '7.8' '5.3' '8.5' '7.2' '9.4' '5.8' '6.9' '8.9' '6.6' '8.6' '5.5'
 '7.0' '6.2' '5.0' '9.3' '6.0' '5.9' '9.0' '3.3' '7.1' '4.2' '5.4' '4.9'
 '5.1' '9.1' '3.9' '4.6' '5.7' '4.3' '5.6' '4.4' '3.2' '2.7' '4.7' '3.7'
 '4.0' '3.4' '9.6' '3.8' '2.5' '4.5' '9.7' '2.8' '3.0' '9.5' '3.1' '1.2'
 '3.5' '2.4' '2.0' '2.3' '1.9' '2.

In [9]:
# Drop rows with specifc value(s)

# Kepp new_df (as backup) and create new_df2
new_df2 = new_df[new_df['Rate'] != 'No Rate']
new_df2 = new_df2[new_df2['Votes'] != 'No Votes']
new_df2 = new_df2[new_df2['Duration'] != 'None']

# According to Wikipedia, 
# if a film has not been submitted for a rating \
# or is an uncut version of a film that was submitted,\
# the labels Not Rated (NR) or Unrated (UR) are often used
# so we will keep rows with "Not Rated" or "Unrated".
# We delete rows with 'None' value, which is equal to Null value
new_df2 = new_df2[new_df2['Certificate'] != 'None']


# Clean the leftover columns with "No rate" value
new_df2 = new_df2[new_df2['Nudity'] != 'No Rate']
new_df2 = new_df2[new_df2['Violence'] != 'No Rate']
new_df2 = new_df2[new_df2['Profanity'] != 'No Rate']
new_df2 = new_df2[new_df2['Alcohol'] != 'No Rate']
new_df2 = new_df2[new_df2['Frightening'] != 'No Rate']

new_df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4255 entries, 0 to 5027
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         4255 non-null   object
 1   Year         4255 non-null   int64 
 2   Rate         4255 non-null   object
 3   Votes        4255 non-null   object
 4   Genre        4255 non-null   object
 5   Duration     4255 non-null   object
 6   Type         4255 non-null   object
 7   Certificate  4255 non-null   object
 8   Episodes     4255 non-null   object
 9   Nudity       4255 non-null   object
 10  Violence     4255 non-null   object
 11  Profanity    4255 non-null   object
 12  Alcohol      4255 non-null   object
 13  Frightening  4255 non-null   object
dtypes: int64(1), object(13)
memory usage: 498.6+ KB


In [23]:
new_df2.head()

Unnamed: 0,Name,Year,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
0,No Time to Die,2021,7.6,107163,"Action, Adventure, Thriller",163,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
1,The Guilty,2021,6.3,64375,"Crime, Drama, Thriller",90,Film,R,-,,,Severe,,Moderate
2,The Many Saints of Newark,2021,6.4,27145,"Crime, Drama",120,Film,R,-,Moderate,Severe,Severe,Moderate,Moderate
3,Venom: Let There Be Carnage,2021,6.4,30443,"Action, Adventure, Sci-Fi",97,Film,PG-13,-,,Moderate,Moderate,Mild,Moderate
4,Dune,2021,8.3,84636,"Action, Adventure, Drama",155,Film,PG-13,-,,Moderate,,Mild,Moderate


In [13]:
#Export cleaned_df to csv file
#new_df2.to_csv('new_df2.csv')

<!-- le = preprocessing.LabelEncoder()
le.fit(df.fruit)
df['categorical_label'] = le.transform(df.fruit) -->

In [46]:
# Transforming string value to numerically categorical value 
test_df = new_df2[['Nudity','Violence',\
    'Profanity','Alcohol','Frightening','Type','Certificate']]
le = preprocessing.LabelEncoder()
for col in ['Nudity','Violence',\
    'Profanity','Alcohol','Frightening','Type','Certificate']:
    le.fit(test_df.loc[:,col])
    test_df[col+'_category'] = le.transform(test_df.loc[:,col])

test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col+'_category'] = le.transform(test_df.loc[:,col])


Unnamed: 0,Nudity,Violence,Profanity,Alcohol,Frightening,Type,Certificate,Nudity_category,Violence_category,Profanity_category,Alcohol_category,Frightening_category,Type_category,Certificate_category
0,Mild,Moderate,Mild,Mild,Moderate,Film,PG-13,0,1,0,0,1,0,10
1,,,Severe,,Moderate,Film,R,2,2,3,2,1,0,12
2,Moderate,Severe,Severe,Moderate,Moderate,Film,R,1,3,3,1,1,0,12
3,,Moderate,Moderate,Mild,Moderate,Film,PG-13,2,1,1,0,1,0,10
4,,Moderate,,Mild,Moderate,Film,PG-13,2,1,2,0,1,0,10


In [47]:
test_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 4255 entries, 0 to 5027
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Nudity                4255 non-null   object
 1   Violence              4255 non-null   object
 2   Profanity             4255 non-null   object
 3   Alcohol               4255 non-null   object
 4   Frightening           4255 non-null   object
 5   Type                  4255 non-null   object
 6   Certificate           4255 non-null   object
 7   Nudity_category       4255 non-null   int32 
 8   Violence_category     4255 non-null   int32 
 9   Profanity_category    4255 non-null   int32 
 10  Alcohol_category      4255 non-null   int32 
 11  Frightening_category  4255 non-null   int32 
 12  Type_category         4255 non-null   int32 
 13  Certificate_category  4255 non-null   int32 
dtypes: int32(7), object(7)
memory usage: 511.3+ KB


In [1]:
# import pickle
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor

# df_train = pd.read_csv('cleaned_train.csv')


# # Separating X and y
# X = df_train.drop('SalePrice', axis=1)
# y = df_train['SalePrice']

# # Build random forest model
# model = RandomForestRegressor(n_estimators=100, criterion='mae')
# model.fit(X, y)

# # Saving the model
# pickle.dump(model, open('house_price_model.pkl', 'wb'))