In [2209]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns; sns.set()
import statsmodels.formula.api as sfa
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression

pd.options.display.max_columns = None

In [2210]:
netflix_og = pd.read_csv('./NetflixOriginals.csv', sep=',', encoding='Windows-1252')
netflix_og.head(10)

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi
5,Drive,Action,"November 1, 2019",147,3.5,Hindi
6,Leyla Everlasting,Comedy,"December 4, 2020",112,3.7,Turkish
7,The Last Days of American Crime,Heist film/Thriller,"June 5, 2020",149,3.7,English
8,Paradox,Musical/Western/Fantasy,"March 23, 2018",73,3.9,English
9,Sardar Ka Grandson,Comedy,"May 18, 2021",139,4.1,Hindi


In [2211]:
netflix_og.columns = [col.lower() for col in netflix_og.columns]
netflix_og.rename({'imdb score': 'imdb-score'}, axis=1, inplace=True)
# netflix_og.columns

In [2212]:
netflix_og.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   title       584 non-null    object 
 1   genre       584 non-null    object 
 2   premiere    584 non-null    object 
 3   runtime     584 non-null    int64  
 4   imdb-score  584 non-null    float64
 5   language    584 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 27.5+ KB


In [2213]:
netflix_og['premiere'] = pd.to_datetime(netflix_og['premiere'])
# netflix_og.sample()

In [2214]:
# netflix_og['language'].value_counts()

In [2215]:
def insert_value_into_next_col(values, n):
    l = list()
    for v in values:
        if len(v) > n-1: l.append(v[n-1])
        else: l.append(pd.NA)
    return l

In [2216]:
languages = netflix_og['language'].to_list()
languages = [lang.split('/') for lang in languages]
# languages

In [2217]:
netflix_og.loc[netflix_og['language'].str.count('/') > 1]

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language
343,Tigertail,Drama,2020-04-10,91,6.5,English/Taiwanese/Mandarin
481,First They Killed My Father,Drama,2017-09-15,136,7.2,Khmer/English/French
580,Winter on Fire: Ukraine's Fight for Freedom,Documentary,2015-10-09,91,8.4,English/Ukranian/Russian


In [2218]:
def strip_values(values):
    stripped_values = list()
    for value in values:
        if isinstance(value, list):
            for v in value:
                v = v.strip()
                v = v.replace('\"', '')
                stripped_values.append(v)
        else:
            value = value.strip()
            value = value.replace('\"', '')
            stripped_values.append(value)
    return stripped_values


languages = strip_values(languages)
# set(languages)

In [2219]:
netflix_og['language'] = [l.replace('Thia', 'Thai') for l in netflix_og['language']]
netflix_og.loc[netflix_og['language'].str.contains('Thia', case=False)]

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language


In [2220]:
languages = netflix_og['language'].to_list()
languages = [lang.split('/') for lang in languages]

In [2221]:
language_occurances = dict()
for lang in languages:
    if isinstance(lang, list):
        for l in lang:
            if l in language_occurances.keys(): language_occurances[l] += 1
            else: language_occurances[l] = 1
    else:
        if lang in language_occurances.keys(): language_occurances[lang] += 1
        else: language_occurances[lang] = 1

# for l in sorted(language_occurances.items(), key=lambda x: x[1]):
#     print(l)

In [2222]:
low_language_occurances = [(k, v) for k, v in language_occurances.items() if v < 3]
low_language_occurances = dict(low_language_occurances)
# low_language_occurances

In [2223]:
netflix_og['first-lang'] = insert_value_into_next_col(languages, 1)
netflix_og['second-lang'] = insert_value_into_next_col(languages, 2)
# netflix_og['third-lang'] = insert_value_into_next_col(languages, 3)  # tylko 3 przypadki
netflix_og.sample(10)

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language,first-lang,second-lang
80,"Strip Down, Rise Up",Documentary,2021-02-05,112,5.2,English,English,
424,ReMastered: Who Shot the Sheriff?,Documentary,2018-10-12,57,6.9,English,English,
165,American Son,Drama,2019-11-01,90,5.8,English,English,
33,Porta dos Fundos: The First Temptation of Christ,Comedy,2019-12-03,46,4.6,Portuguese,Portuguese,
476,The Great Hack,Documentary,2019-07-24,114,7.1,English,English,
558,Road to Roma,Making-of,2020-02-11,72,7.7,Spanish,Spanish,
500,El Camino: A Breaking Bad Movie,Crime drama,2019-10-11,121,7.3,English,English,
118,Ride or Die,Psychological thriller drama,2021-04-15,142,5.5,Japanese,Japanese,
448,The Christmas Chronicles,Christmas/Fantasy/Adventure/Comedy,2018-11-22,104,7.0,English,English,
43,Christmas Crossfire,Thriller,2020-12-04,106,4.8,German,German,


In [2224]:
netflix_og['second-lang'].value_counts()

Spanish      5
English      3
Japanese     2
Mandarin     2
Hindi        2
Basque       1
Catalan      1
Swedish      1
Taiwanese    1
Korean       1
Arabic       1
Russian      1
Akan         1
Ukranian     1
Name: second-lang, dtype: int64

In [2225]:
def replace_lang_to_other(lang):
    return 'Other' if lang in low_language_occurances.keys() else lang

netflix_og['first-lang'] = netflix_og['first-lang'].apply(replace_lang_to_other)
# print(netflix_og.loc[netflix_og['first-lang'].isin(low_language_occurances.keys())])

netflix_og['second-lang'] = netflix_og['second-lang'].apply(replace_lang_to_other)
# print(netflix_og.loc[netflix_og['second-lang'].isin(low_language_occurances.keys())])

In [2226]:
netflix_og.sample()

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language,first-lang,second-lang
413,AK vs AK,Thriller,2020-12-24,108,6.9,Hindi,Hindi,


In [2227]:
netflix_og['first-lang'].value_counts()

English       419
Spanish        34
Hindi          33
French         20
Italian        14
Portuguese     12
Indonesian      9
Other           9
Korean          6
Japanese        6
Turkish         5
German          5
Dutch           3
Polish          3
Marathi         3
Thai            3
Name: first-lang, dtype: int64

In [2228]:
netflix_og['second-lang'].value_counts()

Other       8
Spanish     5
English     3
Japanese    2
Mandarin    2
Hindi       2
Korean      1
Name: second-lang, dtype: int64

In [2229]:
netflix_og.loc[~netflix_og['second-lang'].isin([pd.NA])]

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language,first-lang,second-lang
0,Enter the Anime,Documentary,2019-08-05,58,2.5,English/Japanese,English,Japanese
30,After Maria,Documentary,2019-05-24,37,4.6,English/Spanish,English,Spanish
126,Bomb Scared,Black comedy,2017-10-12,89,5.6,Spanish/Basque,Spanish,Other
263,A Tale of Two Kitchens,Documentary,2019-05-22,30,6.3,English/Spanish,English,Spanish
287,The Outsider,Crime drama,2018-03-09,120,6.3,English/Japanese,English,Japanese
295,Birders,Documentary,2019-09-25,37,6.4,English/Spanish,English,Spanish
316,Two Catalonias,Documentary,2018-09-28,116,6.4,Spanish/Catalan,Spanish,Other
320,A 3 Minute Hug,Documentary,2019-10-28,28,6.5,English/Spanish,English,Spanish
334,Life Overtakes Me,Documentary,2019-06-14,40,6.5,English/Swedish,English,Other
343,Tigertail,Drama,2020-04-10,91,6.5,English/Taiwanese/Mandarin,English,Other


In [2230]:
netflix_og.loc[netflix_og['genre'].str.count('/') > 1]

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language,first-lang,second-lang
8,Paradox,Musical/Western/Fantasy,2018-03-23,73,3.9,English,English,
94,A Babysitter's Guide to Monster Hunting,Comedy/Fantasy/Family,2020-10-15,98,5.4,English,English,
306,Over the Moon,Animation/Musical/Adventure,2020-10-23,95,6.4,English,English,
314,The Willoughbys,Animation/Comedy/Adventure,2020-04-22,90,6.4,English,English,
448,The Christmas Chronicles,Christmas/Fantasy/Adventure/Comedy,2018-11-22,104,7.0,English,English,
573,Klaus,Animation/Christmas/Comedy/Adventure,2019-11-15,97,8.2,English,English,


In [2231]:
netflix_og.loc[netflix_og['genre'].str.contains('Adventure')]

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language,first-lang,second-lang
122,The Legacy of a Whitetail Deer Hunter,Adventure/Comedy,2018-07-06,83,5.5,English,English,
227,Holiday in the Wild,Adventure-romance,2019-11-01,85,6.1,English,English,
232,Pee-wee's Big Holiday,Adventure,2016-03-18,89,6.1,English,English,
306,Over the Moon,Animation/Musical/Adventure,2020-10-23,95,6.4,English,English,
314,The Willoughbys,Animation/Comedy/Adventure,2020-04-22,90,6.4,English,English,
337,Mowgli: Legend of the Jungle,Adventure,2018-12-07,104,6.5,English,English,
448,The Christmas Chronicles,Christmas/Fantasy/Adventure/Comedy,2018-11-22,104,7.0,English,English,
573,Klaus,Animation/Christmas/Comedy/Adventure,2019-11-15,97,8.2,English,English,


In [2232]:
netflix_og.loc[netflix_og['genre'].str.contains('Christmas')]

Unnamed: 0,title,genre,premiere,runtime,imdb-score,language,first-lang,second-lang
68,Dolly Parton's Christmas on the Square,Christmas musical,2020-11-22,98,5.2,English,English,
215,The Christmas Chronicles: Part Two,Christmas comedy,2020-11-25,115,6.0,English,English,
333,Jingle Jangle: A Christmas Journey,Family/Christmas musical,2020-11-13,119,6.5,English,English,
448,The Christmas Chronicles,Christmas/Fantasy/Adventure/Comedy,2018-11-22,104,7.0,English,English,
573,Klaus,Animation/Christmas/Comedy/Adventure,2019-11-15,97,8.2,English,English,


In [2233]:
genres_to_change = {
    'Documentary': ['Documentary'],
    'Thriller': ['Thriller'],
    'Science fiction/Drama': ['Science fiction', 'Drama'],
    'Horror thriller': ['Horror', 'Thriller'],
    'Mystery': ['Crime'],
    'Action': ['Action'],
    'Comedy': ['Comedy'],
    'Heist film/Thriller': ['Crime', 'Thriller'],
    'Musical/Western/Fantasy': ['Western', 'Musical'],
    'Drama': ['Drama'],
    'Romantic comedy': ['Romantic', 'Comedy'],
    'Action comedy': ['Action', 'Comedy'],
    'Horror anthology': ['Horror'],
    'Political thriller': ['Thriller'],
    'Superhero-Comedy': ['Superhero', 'Comedy'],
    'Horror': ['Horror'],
    'Romance drama': ['Romantic', 'Drama'],
    'Anime / Short': ['Anime', 'Short'],
    'Superhero': ['Superhero'],
    'Heist': ['Crime'],
    'Western': ['Western'],
    'Animation/Superhero': ['Superhero', 'Animation'],
    'Family film': ['Family'],
    'Action-thriller': ['Action', 'Thriller'],
    'Teen comedy-drama': ['Comedy', 'Drama'],
    'Romantic drama': ['Romantic', 'Drama'],
    'Animation': ['Animation'],
    'Aftershow / Interview': ['Aftershow interview'],
    'Christmas musical': ['Christmas', 'Musical'],
    'Science fiction adventure': ['Science fiction', 'Adventure'],
    'Science fiction': ['Science fiction'],
    'Variety show': ['Variety show'],
    'Comedy-drama': ['Comedy', 'Drama'],
    'Comedy/Fantasy/Family': ['Comedy', 'Fantasy', 'Family'],
    'Supernatural drama': ['Horror', 'Drama'],
    'Action/Comedy': ['Action', 'Comedy'],
    'Action/Science fiction': ['Action', 'Science fiction'],
    'Romantic teenage drama': ['Romantic', 'Drama'],
    'Comedy / Musical': ['Comedy', 'Musical'],
    'Musical': ['Musical'],
    'Science fiction/Mystery': ['Science fiction', 'Crime'],
    'Crime drama': ['Crime', 'Drama'],
    'Psychological thriller drama': ['Psychological', 'Thriller', 'Drama'],
    'Adventure/Comedy': ['Adventure', 'Comedy'],
    'Black comedy': ['Dark comedy'],
    'Romance': ['Romantic'],
    'Horror comedy': ['Horror', 'Comedy'],
    'Christian musical': ['Musical'],
    'Romantic teen drama': ['Romantic', 'Drama'],
    'Family': ['Family'],
    'Dark comedy': ['Dark comedy'],
    'Comedy horror': ['Comedy', 'Horror'],
    'Psychological thriller': ['Psychological', 'Thriller'],
    'Biopic': ['Biographical'],
    'Science fiction/Thriller': ['Science fiction', 'Thriller'],
    'Mockumentary': ['Comedy'],
    'Satire': ['Comedy'],
    'One-man show': ['One-man show'],
    'Romantic comedy-drama': ['Romantic', 'Comedy', 'Drama'],
    'Comedy/Horror': ['Comedy', 'Horror'],
    'Fantasy': ['Fantasy'],
    'Sports-drama': ['Sport', 'Drama'],
    'Zombie/Heist': ['Horror', 'Crime'],
    'Psychological horror': ['Psychological', 'Horror'],
    'Sports film': ['Sport'],
    'Comedy mystery': ['Comedy', 'Crime'],
    'Romantic thriller': ['Romantic', 'Thriller'],
    'Christmas comedy': ['Christmas', 'Comedy'],
    'War-Comedy': ['War', 'Comedy'],
    'Romantic comedy/Holiday': ['Romantic', 'Comedy'],
    'Adventure-romance': ['Romantic', 'Adventure'],
    'Adventure': ['Adventure'],
    'Horror-thriller': ['Horror', 'Thriller'],
    'Dance comedy': ['Comedy'],
    'Stop Motion': ['Animation'],
    'Horror/Crime drama': ['Horror', 'Crime', 'Drama'],
    'Urban fantasy': ['Fantasy'],
    'Drama/Horror': ['Horror', 'Drama'],
    'Family/Comedy-drama': ['Family', 'Comedy', 'Drama'],
    'War': ['War'],
    'Crime thriller': ['Crime', 'Thriller'],
    'Science fiction/Action': ['Science fiction', 'Action'],
    'Teen comedy horror': ['Comedy', 'Horror'],
    'Concert Film': ['Concert'],
    'Musical comedy': ['Musical', 'Comedy'],
    'Animation/Musical/Adventure': ['Musical', 'Animation', 'Adventure'],
    'Animation / Musicial': ['Musical', 'Animation'],
    'Animation/Comedy/Adventure': ['Animation', 'Comedy', 'Adventure'],
    'Action thriller': ['Action', 'Thriller'],
    'Anime/Science fiction': ['Anime', 'Science fiction'],
    'Animation / Short': ['Animation', 'Short'],
    'War drama': ['War', 'Drama'],
    'Family/Christmas musical': ['Christmas', 'Musical', 'Family'],
    'Science fiction thriller': ['Science fiction', 'Thriller'],
    'Drama / Short': ['Drama', 'Short'],
    'Hidden-camera prank comedy': ['Comedy'],
    'Spy thriller': ['Thriller'],
    'Anime/Fantasy': ['Anime', 'Fantasy'],
    'Animated musical comedy': ['Animation', 'Musical'],
    'Variety Show': ['Variety show'],
    'Superhero/Action': ['Superhero', 'Action'],
    'Biographical/Comedy': ['Biographical', 'Comedy'],
    'Historical-epic': ['Historical'],
    'Animation / Comedy': ['Animation', 'Comedy'],
    'Christmas/Fantasy/Adventure/Comedy': ['Christmas', 'Comedy', 'Adventure'],
    'Mentalism special': ['Documentary', 'Thriller'],
    'Drama-Comedy': ['Comedy', 'Drama'],
    'Coming-of-age comedy-drama': ['Comedy', 'Drama'],
    'Historical drama': ['Historical', 'Drama'],
    'Making-of': ['Documentary'],
    'Action-adventure': ['Action', 'Adventure'],
    'Animation / Science Fiction': ['Animation', 'Science fiction'],
    'Anthology/Dark comedy': ['Dark comedy'],
    'Musical / Short': ['Musical', 'Short'],
    'Animation/Christmas/Comedy/Adventure': ['Christmas', 'Animation', 'Adventure']
}

In [2234]:
def insert_subgenre(n):
    l = list()
    for genres in netflix_og['genre']:
        cur_genres = genres_to_change[genres]
        if len(cur_genres) < n: l.append(pd.NA)
        else:
            if isinstance(cur_genres, list): l.append(cur_genres[n-1])
            else: l.append(cur_genres)
    return l


netflix_og['first-genre'] = insert_subgenre(1)
netflix_og['second-genre'] = insert_subgenre(2)
netflix_og['third-genre'] = insert_subgenre(3)
# netflix_og.sample(15)

In [2235]:
# netflix_og['first-genre'].value_counts()

In [2236]:
# netflix_og['second-genre'].value_counts()

In [2237]:
# netflix_og['third-genre'].value_counts()

In [2238]:
netflix_og['premiere-year'] = pd.DatetimeIndex(netflix_og['premiere']).year
netflix_og['premiere-year-q'] = pd.DatetimeIndex(netflix_og['premiere']).month

months_to_quarters = dict(zip(['Q1','Q2','Q3','Q4'], [[1,2,3],[4,5,6],[7,8,9],[10,11,12]]))


def year_month_to_q(month):
    for k, v in months_to_quarters.items():
        if month in v: return k


netflix_og['premiere-year-q'] = netflix_og['premiere-year-q'].apply(year_month_to_q)
# netflix_og.sample(10)

In [2239]:
# netflix_og.columns

In [2240]:
scaler = MinMaxScaler()
netflix_og['runtime-minmax'] = scaler.fit_transform(netflix_og[['runtime']])
netflix_og['runtime-sigm'] = 1 / (1 + np.exp(-netflix_og['runtime-minmax']))

In [2241]:
netflix_og = netflix_og[
    ['title', 'genre', 'first-genre', 'second-genre', 'third-genre',
    'premiere', 'premiere-year', 'premiere-year-q', 'runtime',
    'runtime-minmax', 'runtime-sigm', 'language',
    'first-lang', 'second-lang', 'imdb-score']
]
netflix_og.sample()

Unnamed: 0,title,genre,first-genre,second-genre,third-genre,premiere,premiere-year,premiere-year-q,runtime,runtime-minmax,runtime-sigm,language,first-lang,second-lang,imdb-score
375,Hope Frozen: A Quest to Live Twice,Documentary,Documentary,,,2020-09-15,2020,Q3,80,0.370732,0.591636,Thai/English,Thai,English,6.7


In [2242]:
# sns.pairplot(netflix_og)

In [2243]:
# netflix_og['second-genre'].describe()

In [2325]:
netflix_og_disp = netflix_og.copy()
netflix_og_disp.fillna('None', inplace=True)
netflix_og_disp.head()

Unnamed: 0,title,genre,first-genre,second-genre,third-genre,premiere,premiere-year,premiere-year-q,runtime,runtime-minmax,runtime-sigm,language,first-lang,second-lang,imdb-score
0,Enter the Anime,Documentary,Documentary,,,2019-08-05,2019,Q3,58,0.263415,0.565475,English/Japanese,English,Japanese,2.5
1,Dark Forces,Thriller,Thriller,,,2020-08-21,2020,Q3,81,0.37561,0.592814,Spanish,Spanish,,2.6
2,The App,Science fiction/Drama,Science fiction,Drama,,2019-12-26,2019,Q4,79,0.365854,0.590457,Italian,Italian,,2.6
3,The Open House,Horror thriller,Horror,Thriller,,2018-01-19,2018,Q1,94,0.439024,0.608027,English,English,,3.2
4,Kaali Khuhi,Mystery,Crime,,,2020-10-30,2020,Q4,90,0.419512,0.603367,Hindi,Hindi,,3.4


In [2322]:
fig = px.scatter(netflix_og_disp,
    y='imdb-score', x='first-genre', color='second-genre', symbol='third-genre',
    hover_name='title', labels={'value':'genre'},
    title='IMDB Score by genre', height=800
)
fig.update_traces(marker=dict(size=8, line=dict(width=1)))
fig.show()

In [2246]:
fig = px.histogram(
    netflix_og_disp.groupby(by=['first-lang', 'second-lang'], as_index=False)['imdb-score'].mean(),
    y='imdb-score', x='first-lang',
    color='second-lang',
    title='IMDB Score mean by languages',
    height=400,
    barmode='group')
fig.update_layout(yaxis_title='imdb-score mean')
fig.show()

In [2247]:
fig = px.box(netflix_og_disp,
    y='imdb-score', x='premiere-year-q',
    facet_col='premiere-year',
    color='premiere-year-q',
    category_orders={
        'premiere-year': [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021],
        'premiere-year-q': ['Q1', 'Q2', 'Q3', 'Q4']
    }
)
fig.show()

In [2248]:
fig = px.scatter(netflix_og_disp,
    y='imdb-score', x='runtime-sigm',
    color='first-genre',
    symbol='second-genre',
    hover_data=['title', 'first-genre', 'second-genre', 'runtime'],
    height=600,
    title='IMDB Score by sigm(runtime) and genres'
)
fig.update_traces(marker=dict(size=6, line=dict(width=1)))
fig.update_layout(legend_title='Movie genres (1st, 2nd)')
fig.show()

In [2249]:
fig = px.scatter(netflix_og_disp,
    y='imdb-score', x='runtime',
    color='first-genre',
    symbol='second-genre',
    hover_data=['title', 'first-genre', 'second-genre', 'runtime'],
    height=600,
    title='IMDB Score by runtime and genres'
)
fig.update_traces(marker=dict(size=6, line=dict(width=1)))
fig.update_layout(legend_title='Movie genres (1st, 2nd)')
fig.show()

In [2323]:
fig = px.bar(
    netflix_og_disp.groupby(by=['first-genre', 'second-genre'], as_index=False)['runtime'].mean(),
    y='runtime', x='first-genre',
    color='second-genre',
    title='Runtime mean by genres',
    height=500, orientation='v',
    barmode='group')
fig.update_layout(yaxis_title='runtime mean')
fig.show()

In [2251]:
le = LabelEncoder()
nogcp = netflix_og[
    ['title', 'first-genre', 'second-genre', 'third-genre',
    'premiere-year', 'premiere-year-q', 'runtime', 'runtime-minmax',
    'runtime-sigm', 'first-lang', 'second-lang', 'imdb-score']
]
nogcp['first-genre'] = nogcp['first-genre'].astype('category')
nogcp['second-genre'] = nogcp['second-genre'].astype('category')
nogcp['third-genre'] = nogcp['third-genre'].astype('category')
nogcp['premiere-year-q'] = nogcp['premiere-year-q'].astype('category')
nogcp['first-lang'] = nogcp['first-lang'].astype('category')
nogcp['second-lang'] = nogcp['second-lang'].astype('category')
# nogcp



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [2252]:
nogcp['first-genre-enc'] = le.fit_transform(nogcp['first-genre']).astype('str')
nogcp['second-genre-enc'] = le.fit_transform(nogcp['second-genre']).astype('str')
nogcp['third-genre-enc'] = le.fit_transform(nogcp['third-genre']).astype('str')
nogcp['premiere-year-q-enc'] = le.fit_transform(nogcp['premiere-year-q']).astype('str')
nogcp['first-lang-enc'] = le.fit_transform(nogcp['first-lang']).astype('str')
nogcp['second-lang-enc'] = le.fit_transform(nogcp['second-lang']).astype('str')
# nogcp



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2253]:
# nogcp.info()

In [2254]:
X = nogcp.drop([
    'imdb-score', 'first-genre', 'second-genre', 'third-genre',
    'title', 'first-lang', 'second-lang', 'premiere-year-q'], axis=1)
y = nogcp['imdb-score']
# X.columns

In [2255]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
y_test.name = 'true imdb-score'
y_test.size

117

In [2256]:
rf_tuning = RandomForestRegressor(random_state=0)
param_grid = {
    'n_estimators': [100, 200, 250, 500],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [4, 5, 6, 7],
    'criterion': ['mse']
}
GSCV = GridSearchCV(estimator=rf_tuning, param_grid=param_grid, n_jobs=8, refit=True)
GSCV.fit(X_train, y_train)
GSCV.best_params_

{'criterion': 'mse',
 'max_depth': 4,
 'max_features': 'auto',
 'n_estimators': 100}

In [2257]:
sel = SelectFromModel(
    rf := RandomForestRegressor(n_estimators=100, criterion='mse', max_depth=4, max_features='auto', random_state=0)
)

rf.fit(X_train, y_train)
prediction = rf.predict(X_test)
print('RMSE:', np.sqrt(mean_squared_error(y_test, prediction)))

sel.fit(X_train, y_train)
selected_features = X_train.columns[(sel.get_support())]
print('most important features =', list(selected_features))

for i, v in enumerate(rf.feature_importances_):
    print(X_train.columns[i], '=', v)

RMSE: 0.800148160513829
most important features = ['first-genre-enc']
premiere-year = 0.029112158370295505
runtime = 0.08301893753696378
runtime-minmax = 0.08785887230568969
runtime-sigm = 0.09160984203167664
first-genre-enc = 0.5176012413903454
second-genre-enc = 0.036797912454292515
third-genre-enc = 0.003577990557133589
premiere-year-q-enc = 0.03476281206918453
first-lang-enc = 0.08250775500926892
second-lang-enc = 0.03315247827514944


In [2258]:
regressor = RandomForestRegressor(n_estimators=100, max_depth=4, max_features='auto', random_state=0)
regressor.fit(X_train, y_train)

RandomForestRegressor(max_depth=4, random_state=0)

In [2259]:
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

In [2260]:
pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

In [2261]:
fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score, test_size=0.2, estimators=100, {RMSE=}')
fig.show()

In [2262]:
regressor = RandomForestRegressor(n_estimators=200, random_state=0)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score, test_size=0.2, estimators=200, {RMSE=}')
fig.show()

In [2263]:
regressor = RandomForestRegressor(n_estimators=250, random_state=0)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score, test_size=0.2, estimators=250, {RMSE=}')
fig.show()

In [2264]:
regressor = RandomForestRegressor(n_estimators=500, random_state=0)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score, test_size=0.2, estimators=500, {RMSE=}')
fig.show()

In [2265]:
netflix_titles = pd.read_csv('./netflix_titles.csv')
netflix_titles.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [2266]:
netflix_titles = netflix_titles[['title', 'director', 'country', 'rating']]
netflix_titles.head()

Unnamed: 0,title,director,country,rating
0,Dick Johnson Is Dead,Kirsten Johnson,United States,PG-13
1,Blood & Water,,South Africa,TV-MA
2,Ganglands,Julien Leclercq,,TV-MA
3,Jailbirds New Orleans,,,TV-MA
4,Kota Factory,,India,TV-MA


In [2267]:
# netflix_titles.info()

In [2268]:
# nogcp.head()

In [2269]:
# nogcp.columns

In [2270]:
netflix_all = nogcp.merge(netflix_titles, on='title', how='left')
netflix_all.head()

Unnamed: 0,title,first-genre,second-genre,third-genre,premiere-year,premiere-year-q,runtime,runtime-minmax,runtime-sigm,first-lang,second-lang,imdb-score,first-genre-enc,second-genre-enc,third-genre-enc,premiere-year-q-enc,first-lang-enc,second-lang-enc,director,country,rating
0,Enter the Anime,Documentary,,,2019,Q3,58,0.263415,0.565475,English,Japanese,2.5,11,12,3,2,1,2,Alex Burunova,"United States, Japan",TV-MA
1,Dark Forces,Thriller,,,2020,Q3,81,0.37561,0.592814,Spanish,,2.6,24,12,3,2,13,7,Bernardo Arellano,Mexico,TV-MA
2,The App,Science fiction,Drama,,2019,Q4,79,0.365854,0.590457,Italian,,2.6,21,5,3,3,6,7,Elisa Fuksas,Italy,TV-MA
3,The Open House,Horror,Thriller,,2018,Q1,94,0.439024,0.608027,English,,3.2,16,11,3,0,1,7,"Matt Angel, Suzanne Coote","Canada, United States",TV-MA
4,Kaali Khuhi,Crime,,,2020,Q4,90,0.419512,0.603367,Hindi,,3.4,9,12,3,3,4,7,Terrie Samundra,India,TV-14


In [2271]:
netflix_all = netflix_all[
    ['title', 'first-genre', 'first-genre-enc', 'second-genre', 'second-genre-enc',
    'third-genre', 'third-genre-enc', 'premiere-year', 'premiere-year-q', 'premiere-year-q-enc',
    'first-lang', 'first-lang-enc', 'second-lang', 'second-lang-enc', 'director', 'country',
    'runtime', 'runtime-minmax', 'runtime-sigm', 'rating', 'imdb-score']
]
netflix_all.head()

Unnamed: 0,title,first-genre,first-genre-enc,second-genre,second-genre-enc,third-genre,third-genre-enc,premiere-year,premiere-year-q,premiere-year-q-enc,first-lang,first-lang-enc,second-lang,second-lang-enc,director,country,runtime,runtime-minmax,runtime-sigm,rating,imdb-score
0,Enter the Anime,Documentary,11,,12,,3,2019,Q3,2,English,1,Japanese,2,Alex Burunova,"United States, Japan",58,0.263415,0.565475,TV-MA,2.5
1,Dark Forces,Thriller,24,,12,,3,2020,Q3,2,Spanish,13,,7,Bernardo Arellano,Mexico,81,0.37561,0.592814,TV-MA,2.6
2,The App,Science fiction,21,Drama,5,,3,2019,Q4,3,Italian,6,,7,Elisa Fuksas,Italy,79,0.365854,0.590457,TV-MA,2.6
3,The Open House,Horror,16,Thriller,11,,3,2018,Q1,0,English,1,,7,"Matt Angel, Suzanne Coote","Canada, United States",94,0.439024,0.608027,TV-MA,3.2
4,Kaali Khuhi,Crime,9,,12,,3,2020,Q4,3,Hindi,4,,7,Terrie Samundra,India,90,0.419512,0.603367,TV-14,3.4


In [2272]:
# netflix_all['director']

In [2273]:
# netflix_all['director'] = netflix_all['director'].astype('string')
# netflix_all['director']

In [2274]:
netflix_all['director'].fillna('None', inplace=True)

In [2275]:
def clean_and_count_values(df, column):
    values = df[column]
    values = [v.split(',') for v in values]
    value_occurances = dict()
    new_values = list()

    for value in values:
        if isinstance(value, list):
            temp = list()
            for i, v in enumerate(value):
                v = v.replace('\"', '')
                v = v.strip()
                if v in value_occurances.keys(): value_occurances[v] += 1
                else: value_occurances[v] = 1
                temp.append([v, value_occurances[v]])
            new_values.append(temp)
        else:
            value = value.replace('\"', '')
            value = value.strip()
            if value in value_occurances.keys(): value_occurances[value] += 1
            else: value_occurances[value] = 1
            new_values.append([value, value_occurances[value]])
    
    for idx, value in enumerate(new_values):
        value = value[0]
        if isinstance(value[0], list):
            for i, v in enumerate(value):
                new_values[idx][i][1] = value_occurances[v]
                print(new_values[idx])
        else: new_values[idx][0][1] = value_occurances[value[0]]

    return new_values


directors = clean_and_count_values(netflix_all, 'director')
# directors[:10]

In [2276]:
def sort_counted_values(values):
    sorted_values = list()
    for value in values:
        sorted_values.append(sorted(value, key=lambda x: x[1], reverse=True))
    return sorted_values

directors = sort_counted_values(directors)
# directors[:30]

In [2277]:
directors_sorted_with_occ = list(zip(netflix_all['director'], directors))
# directors_sorted_with_occ

In [2278]:
def fill_directors():
    first_dirs = list()
    second_dirs = list()
    for idx in range(len(netflix_all['director'])):
        if directors_sorted_with_occ[idx][1][0][0] == 'None':
            first_dirs.append(pd.NA)
        else:
            first_dirs.append(directors_sorted_with_occ[idx][1][0][0])
        if len(directors_sorted_with_occ[idx][1]) > 1:
            second_dirs.append(directors_sorted_with_occ[idx][1][1][0])
        else: second_dirs.append(pd.NA)
    return first_dirs, second_dirs

first_dirs, second_dirs = fill_directors()
# first_dirs[:10], second_dirs[:10], list(netflix_all['director'][:10])

In [2279]:
netflix_all['first-dir'] = first_dirs
netflix_all['second-dir'] = second_dirs
# netflix_all['first-dir'].value_counts(), netflix_all['second-dir'].value_counts()
# netflix_all[-10:]

In [2280]:
netflix_all.replace(np.nan, 'None', inplace=True)
netflix_all['country'] = netflix_all['country'].astype('string')
netflix_all['country'] = netflix_all['country'].apply(lambda x: 'None' if x == None else x)
# netflix_all['country'] = netflix_all['country'].astype('category')
# netflix_all['country']

In [2281]:
countries = clean_and_count_values(netflix_all, 'country')
sorted_countries_with_occurances = sort_counted_values(countries)
# sorted_countries_with_occurances

In [2282]:
netflix_all['first-country'] = [c[0][0] for c in sorted_countries_with_occurances]
netflix_all['second-country'] = [c[1][0] if len(c) > 1 else pd.NA for c in sorted_countries_with_occurances]
netflix_all['third-country'] = [c[2][0] if len(c) > 2 else pd.NA for c in sorted_countries_with_occurances]

In [2283]:
# netflix_all.sample(15)

In [2284]:
netflix_all.replace('None', np.NaN, inplace=True)


elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison



In [2285]:
netflix_all.rename({'director':'directors', 'country':'countries'}, axis=1, inplace=True)

In [2286]:
# netflix_all.info()

In [2287]:
netflix_all['rating'] = netflix_all['rating'].astype('category')
netflix_all['first-dir'] = netflix_all['first-dir'].astype('category')
netflix_all['second-dir'] = netflix_all['second-dir'].astype('category')
netflix_all['first-country'] = netflix_all['first-country'].astype('category')
netflix_all['second-country'] = netflix_all['second-country'].astype('category')
netflix_all['third-country'] = netflix_all['third-country'].astype('category')
# netflix_all.info()

In [2288]:
# netflix_all.head(10)

In [2289]:
netflix_all['rating-enc'] = le.fit_transform(netflix_all['rating']).astype('str')
netflix_all['first-dir-enc'] = le.fit_transform(netflix_all['first-dir']).astype('str')
netflix_all['second-dir-enc'] = le.fit_transform(netflix_all['second-dir']).astype('str')
netflix_all['first-country-enc'] = le.fit_transform(netflix_all['first-country']).astype('str')
netflix_all['second-country-enc'] = le.fit_transform(netflix_all['second-country']).astype('str')
netflix_all['third-country-enc'] = le.fit_transform(netflix_all['third-country']).astype('str')
netflix_all.replace('None', np.NaN, inplace=True)
# netflix_all.columns

In [2290]:
netflix_all = netflix_all[
    ['title', 'first-genre', 'first-genre-enc', 'second-genre',
       'second-genre-enc', 'third-genre', 'third-genre-enc', 'premiere-year',
       'premiere-year-q', 'premiere-year-q-enc', 'runtime', 'runtime-minmax',
       'runtime-sigm', 'first-lang', 'first-lang-enc', 'second-lang',
       'second-lang-enc', 'directors', 'first-dir', 'first-dir-enc',
       'second-dir', 'second-dir-enc', 'countries', 'first-country', 'first-country-enc',
       'second-country', 'second-country-enc', 'third-country', 'third-country-enc',
       'rating', 'rating-enc', 'imdb-score']
]
netflix_all.sample()

Unnamed: 0,title,first-genre,first-genre-enc,second-genre,second-genre-enc,third-genre,third-genre-enc,premiere-year,premiere-year-q,premiere-year-q-enc,runtime,runtime-minmax,runtime-sigm,first-lang,first-lang-enc,second-lang,second-lang-enc,directors,first-dir,first-dir-enc,second-dir,second-dir-enc,countries,first-country,first-country-enc,second-country,second-country-enc,third-country,third-country-enc,rating,rating-enc,imdb-score
177,Rich in Love,Romantic,20,Comedy,3,,3,2020,Q2,1,105,0.492683,0.620738,Portuguese,12,,7,Bruno Garotti,Bruno Garotti,56,,31,Brazil,Brazil,3,,31,,11,TV-14,4,5.8


In [2291]:
enc_columns = list()
for col in netflix_all.columns:
    if '-enc' in col:
        enc_columns.append(col)
# enc_columns

In [2292]:
enc_columns_base = [col.replace('-enc', '') for col in enc_columns]
# enc_columns_base

In [2293]:
netflix_disp = netflix_all.copy()

for i, col in enumerate(enc_columns):
    netflix_all[col] = netflix_all[col].astype('category')
    netflix_all[col] = le.fit_transform(netflix_all[enc_columns_base[i]]).astype('str')
    netflix_disp[col] = le.fit_transform(netflix_all[enc_columns_base[i]]).astype('str')

# netflix_all.head()

In [2294]:
# enc_columns_base

In [2324]:
for col in enc_columns_base:
    netflix_disp[col] = netflix_disp[col].astype('string')

netflix_disp['countries'] = netflix_disp[col].astype('string')
netflix_disp.fillna('None', inplace=True)
netflix_disp.head()

Unnamed: 0,title,first-genre,first-genre-enc,second-genre,second-genre-enc,third-genre,third-genre-enc,premiere-year,premiere-year-q,premiere-year-q-enc,runtime,runtime-minmax,runtime-sigm,first-lang,first-lang-enc,second-lang,second-lang-enc,directors,first-dir,first-dir-enc,second-dir,second-dir-enc,countries,first-country,first-country-enc,second-country,second-country-enc,third-country,third-country-enc,rating,rating-enc,imdb-score
0,Enter the Anime,Documentary,11,,12,,3,2019,Q3,2,58,0.263415,0.565475,English,1,Japanese,2,Alex Burunova,Alex Burunova,12,,47,TV-MA,United States,26,Japan,17,,11,TV-MA,5,2.5
1,Dark Forces,Thriller,24,,12,,3,2020,Q3,2,81,0.37561,0.592814,Spanish,13,,7,Bernardo Arellano,Bernardo Arellano,42,,47,TV-MA,Mexico,13,,31,,11,TV-MA,5,2.6
2,The App,Science fiction,21,Drama,5,,3,2019,Q4,3,79,0.365854,0.590457,Italian,6,,7,Elisa Fuksas,Elisa Fuksas,124,,47,TV-MA,Italy,10,,31,,11,TV-MA,5,2.6
3,The Open House,Horror,16,Thriller,11,,3,2018,Q1,0,94,0.439024,0.608027,English,1,,7,"Matt Angel, Suzanne Coote",Matt Angel,275,Suzanne Coote,40,TV-MA,Canada,4,United States,30,,11,TV-MA,5,3.2
4,Kaali Khuhi,Crime,9,,12,,3,2020,Q4,3,90,0.419512,0.603367,Hindi,4,,7,Terrie Samundra,Terrie Samundra,422,,47,TV-14,India,8,,31,,11,TV-14,3,3.4


In [2296]:
netflix_all.columns

Index(['title', 'first-genre', 'first-genre-enc', 'second-genre',
       'second-genre-enc', 'third-genre', 'third-genre-enc', 'premiere-year',
       'premiere-year-q', 'premiere-year-q-enc', 'runtime', 'runtime-minmax',
       'runtime-sigm', 'first-lang', 'first-lang-enc', 'second-lang',
       'second-lang-enc', 'directors', 'first-dir', 'first-dir-enc',
       'second-dir', 'second-dir-enc', 'countries', 'first-country',
       'first-country-enc', 'second-country', 'second-country-enc',
       'third-country', 'third-country-enc', 'rating', 'rating-enc',
       'imdb-score'],
      dtype='object')

In [2297]:
netflix_disp.columns

Index(['title', 'first-genre', 'first-genre-enc', 'second-genre',
       'second-genre-enc', 'third-genre', 'third-genre-enc', 'premiere-year',
       'premiere-year-q', 'premiere-year-q-enc', 'runtime', 'runtime-minmax',
       'runtime-sigm', 'first-lang', 'first-lang-enc', 'second-lang',
       'second-lang-enc', 'directors', 'first-dir', 'first-dir-enc',
       'second-dir', 'second-dir-enc', 'countries', 'first-country',
       'first-country-enc', 'second-country', 'second-country-enc',
       'third-country', 'third-country-enc', 'rating', 'rating-enc',
       'imdb-score'],
      dtype='object')

# resyserzy, kraje, rating

In [2298]:
fig = px.scatter(netflix_disp,
    x='imdb-score', y='first-dir',
    orientation='h',
    color='second-dir',
    hover_data=['title', 'first-dir', 'second-dir', 'imdb-score'],
    height=1150,
    title='IMDB Score by directors'
)
fig.update_traces(marker=dict(size=6, line=dict(width=1)))
fig.update_layout(legend_title='Co-directors')
fig.show()

In [2299]:
fig = px.scatter(netflix_disp,
    y='imdb-score', x='first-country',
    color='second-country',
    symbol='third-country',
    hover_data=['title', 'first-country', 'second-country', 'third-country', 'imdb-score'],
    height=1150,
    title='IMDB Score by countries'
)
fig.update_traces(marker=dict(size=6, line=dict(width=1)))
fig.update_layout(legend_title='Countries (2nd, 3rd)')
fig.show()

In [2300]:
# fig = px.box(netflix_disp,
#     x='imdb-score', y='first-country',
#     orientation='h',
#     color='second-country',
#     hover_data=['title', 'first-country', 'second-country', 'third-country', 'imdb-score'],
#     height=1150,
#     title='IMDB Score by countries'
# )
# fig.update_traces(marker=dict(size=6, line=dict(width=1)))
# fig.update_layout(legend_title='Countries (1st, 2nd)')
# fig.show()

In [2301]:
fig = px.box(netflix_disp,
    y='imdb-score', x='rating',
    hover_data=['title', 'rating', 'imdb-score'],
    height=600,
    title='IMDB Score by rating'
)
fig.update_traces(marker=dict(size=6, line=dict(width=1)))
fig.show()

In [2326]:
fig = px.scatter(netflix_disp,
    x='first-dir', y='first-genre',
    orientation='h',
    color='second-genre',
    symbol='second-dir',
    hover_data=['title', 'first-genre', 'second-genre', 'first-dir', 'second-dir', 'imdb-score'],
    height=1150,
    title='Genres by directors'
)
fig.update_traces(marker=dict(size=6, line=dict(width=1)))
fig.update_layout(legend_title='2nd genre, 2nd director')
fig.show()

In [2303]:
X = netflix_all.drop(
    ['title', 'first-genre', 'second-genre', 'third-genre', 'premiere-year-q',
    'first-lang', 'second-lang', 'directors', 'first-dir', 'second-dir', 'countries',
    'first-country', 'second-country', 'third-country', 'rating', 'imdb-score'], axis=1)
y = netflix_all['imdb-score']
X.columns

Index(['first-genre-enc', 'second-genre-enc', 'third-genre-enc',
       'premiere-year', 'premiere-year-q-enc', 'runtime', 'runtime-minmax',
       'runtime-sigm', 'first-lang-enc', 'second-lang-enc', 'first-dir-enc',
       'second-dir-enc', 'first-country-enc', 'second-country-enc',
       'third-country-enc', 'rating-enc'],
      dtype='object')

In [2304]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
y_test.name = 'true imdb-score'

In [2305]:
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score with added columns, test_size=0.2, estimators=100, {RMSE=}')
fig.show()

In [2306]:
regressor = RandomForestRegressor(n_estimators=200, random_state=0)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score with added columns, test_size=0.2, estimators=200, {RMSE=}')
fig.show()

In [2307]:
regressor = RandomForestRegressor(n_estimators=250, random_state=0)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score with added columns, test_size=0.2, estimators=250, {RMSE=}')
fig.show()

In [2308]:
regressor = RandomForestRegressor(n_estimators=500, random_state=0)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Prediction and true imdb-score with added columns, test_size=0.2, estimators=500, {RMSE=}')
fig.show()

In [2309]:
rf_tuning = RandomForestRegressor(random_state=0)
param_grid = {
    'n_estimators': [100, 200, 250, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7],
    'criterion': ['mse']
}
GSCV = GridSearchCV(estimator=rf_tuning, param_grid=param_grid, cv=5, n_jobs=8)
GSCV.fit(X_train, y_train)
GSCV.best_params_

{'criterion': 'mse',
 'max_depth': 7,
 'max_features': 'sqrt',
 'n_estimators': 500}

In [2310]:
rf = RandomForestRegressor(random_state=0, n_estimators=500,
    criterion='mse', max_features='sqrt', max_depth=7)
rf.fit(X_train, y_train)
prediction = rf.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(
    title=f'Prediction and true imdb-score with GridSearchCV parameters:<br>\
[criterion:"mse", max_depth:7, max_features:"sqrt", n_estimators:500],<br>\
test_size=0.2, estimators=500, {RMSE=}')
fig.show()

In [2311]:
sel = SelectFromModel(rf)
sel.fit(X_train, y_train)
selected_features = X_train.columns[(sel.get_support())]
print(selected_features)

Index(['first-genre-enc', 'runtime', 'runtime-minmax', 'runtime-sigm',
       'first-dir-enc', 'first-country-enc'],
      dtype='object')


In [2312]:
for i, v in enumerate(rf.feature_importances_):
    print(X_train.columns[i], '=', v)

first-genre-enc = 0.21555976063167165
second-genre-enc = 0.041122595861421064
third-genre-enc = 0.003927230217964963
premiere-year = 0.04195901396042643
premiere-year-q-enc = 0.035173647423937665
runtime = 0.09307843544116864
runtime-minmax = 0.09375852164281956
runtime-sigm = 0.09276412535737621
first-lang-enc = 0.04812062838308644
second-lang-enc = 0.016590996821489557
first-dir-enc = 0.10582424100728925
second-dir-enc = 0.040244670689839876
first-country-enc = 0.09604545873892424
second-country-enc = 0.022400031137420055
third-country-enc = 0.0021146380809158017
rating-enc = 0.0513160046042485


In [2313]:
X = netflix_all[[
    'first-genre-enc', 'premiere-year', 'runtime',
    'runtime-minmax', 'runtime-sigm', 'first-dir-enc',
    'first-country-enc']]
y = netflix_all['imdb-score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [2314]:
rf = RandomForestRegressor(random_state=0, n_estimators=500,
    criterion='mse', max_features='sqrt', max_depth=7)
rf.fit(X_train, y_train)
prediction = rf.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index

In [2327]:
fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(
    title=f'Prediction and true imdb-score, CV, attributes:<br>\
[first-genre-enc, premiere-year, runtime, runtime-minmax, runtime-sigm, first-dir-enc, first-country-enc],<br>\
test_size=0.2, estimators=500, {RMSE=}')
fig.show()

In [2316]:
X = netflix_all[['first-genre-enc']]
y = netflix_all['imdb-score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = LinearRegression()
model.fit(X_train, y_train)
prediction = model.predict(X_test)

pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Linear Regression by first-genre-enc, {RMSE=}')
fig.show()

In [2317]:
# netflix_all.columns

In [2318]:
lmdf = netflix_all[['first-genre-enc', 'imdb-score']]
lmdf.rename({'first-genre-enc':'first_genre_enc', 'imdb-score':'imdb_score'}, axis=1, inplace=True)
lmStats = sfa.ols(data=lmdf, formula='imdb_score ~ first_genre_enc')
result = lmStats.fit()
result.params



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Intercept                5.678947
first_genre_enc[T.1]     0.354386
first_genre_enc[T.10]    0.361053
first_genre_enc[T.11]    1.264880
first_genre_enc[T.12]    0.665924
first_genre_enc[T.13]    0.181053
first_genre_enc[T.14]    0.371053
first_genre_enc[T.15]    1.371053
first_genre_enc[T.16]   -0.298947
first_genre_enc[T.17]    0.633553
first_genre_enc[T.18]    1.454386
first_genre_enc[T.19]    0.554386
first_genre_enc[T.2]     1.071053
first_genre_enc[T.20]    0.221053
first_genre_enc[T.21]   -0.078947
first_genre_enc[T.22]    0.421053
first_genre_enc[T.23]   -0.338947
first_genre_enc[T.24]   -0.092836
first_genre_enc[T.25]    0.421053
first_genre_enc[T.26]    1.061053
first_genre_enc[T.27]   -0.153947
first_genre_enc[T.3]     1.113910
first_genre_enc[T.4]     0.471053
first_genre_enc[T.5]     0.761053
first_genre_enc[T.6]     0.901053
first_genre_enc[T.7]     0.066622
first_genre_enc[T.8]     1.954386
first_genre_enc[T.9]     0.464803
dtype: float64

In [2319]:
model.score(pd.DataFrame(lmdf.first_genre_enc), lmdf.imdb_score), result.rsquared

(0.025380582709884192, 0.3215546098385408)

In [2320]:
y_test.name = 'imdb_score'
X_test.rename({'first-genre-enc':'first_genre_enc'}, axis=1, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [2321]:
prediction = result.predict(X_test)
pred_ser = pd.Series(prediction, name='predicted imdb-score')
pred_ser.index = y_test.index
RMSE = np.sqrt(mean_squared_error(y_test, prediction))

fig = go.Figure((go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true imdb-score')))
fig.add_trace(go.Scatter(x=pred_ser.index, y=pred_ser, mode='markers', name='predicted imdb-score'))
fig.update_layout(title=f'Linear Regression OLS model on first-genre-enc, {RMSE=}')
fig.show()