In [223]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

In [224]:
!pwd

/home/s26549/sharedfolder/wlasne_projekty/netflix_classification


In [225]:
netflix = pd.read_csv('NetflixOriginals.csv', sep=',', encoding='Windows-1252')
netflix.head()

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [226]:
cols_rename = {'Title': 'title', 'Genre': 'genre', 'Premiere': 'premiere',
    'Runtime': 'runtime', 'IMDB Score': 'imdb', 'Language': 'language'}
netflix.rename(cols_rename, axis=1, inplace=True)
netflix.columns

Index(['title', 'genre', 'premiere', 'runtime', 'imdb', 'language'], dtype='object')

In [227]:
# netflix['genre'].unique()

In [228]:
netflix[netflix['genre'].isna()]

Unnamed: 0,title,genre,premiere,runtime,imdb,language


In [229]:
genres_norm = {
    'Documentary': ['Documentary'],
    'Thriller': ['Thriller'],
    'Science fiction/Drama': ['Science fiction', 'Drama'],
    'Horror thriller': ['Horror', 'Thriller'],
    'Mystery': ['Crime'],
    'Action': ['Action'],
    'Comedy': ['Comedy'],
    'Heist film/Thriller': ['Crime', 'Thriller'],
    'Musical/Western/Fantasy': ['Musical', 'Western', 'Fantasy'],
    'Drama': ['Drama'],
    'Romantic comedy': ['Romantic', 'Comedy'],
    'Action comedy': ['Action', 'Comedy'],
    'Horror anthology': ['Horror'],
    'Political thriller': ['Thriller'],
    'Superhero-Comedy': ['Superhero', 'Comedy'],
    'Horror': ['Horror'],
    'Romance drama': ['Romantic', 'Drama'],
    'Anime / Short': ['Anime', 'Short'],
    'Superhero': ['Superhero'],
    'Heist': ['Crime'],
    'Western': ['Western'],
    'Animation/Superhero': ['Animation', 'Superhero'],
    'Family film': ['Family'],
    'Action-thriller': ['Action', 'Thriller'],
    'Teen comedy-drama': ['Comedy', 'Drama'],
    'Romantic drama': ['Romantic', 'Drama'],
    'Animation': ['Animation'],
    'Aftershow / Interview': ['Aftershow interview'],
    'Christmas musical': ['Christmas', 'Musical'],
    'Science fiction adventure': ['Science fiction', 'Adventure'],
    'Science fiction': ['Science fiction'],
    'Variety show': ['Variety show'],
    'Comedy-drama': ['Comedy', 'Drama'],
    'Comedy/Fantasy/Family': ['Comedy', 'Fantasy', 'Family'],
    'Supernatural drama': ['Horror', 'Drama'],
    'Action/Comedy': ['Action', 'Comedy'],
    'Action/Science fiction': ['Action', 'Science fiction'],
    'Romantic teenage drama': ['Romantic', 'Drama'],
    'Comedy / Musical': ['Comedy', 'Musical'],
    'Musical': ['Musical'],
    'Science fiction/Mystery': ['Science fiction', 'Crime'],
    'Crime drama': ['Crime', 'Drama'],
    'Psychological thriller drama': ['Psychological', 'Thriller', 'Drama'],
    'Adventure/Comedy': ['Adventure', 'Comedy'],
    'Black comedy': ['Comedy'],
    'Romance': ['Romantic'],
    'Horror comedy': ['Horror', 'Comedy'],
    'Christian musical': ['Musical'],
    'Romantic teen drama': ['Romantic', 'Drama'],
    'Family': ['Family'],
    'Dark comedy': ['Comedy'],
    'Comedy horror': ['Comedy', 'Horror'],
    'Psychological thriller': ['Psychological', 'Thriller'],
    'Biopic': ['Biographical'],
    'Science fiction/Thriller': ['Science fiction', 'Thriller'],
    'Mockumentary': ['Comedy'],
    'Satire': ['Comedy'],
    'One-man show': ['One-man show'],
    'Romantic comedy-drama': ['Romantic', 'Comedy', 'Drama'],
    'Comedy/Horror': ['Comedy'],
    'Fantasy': ['Fantasy'],
    'Sports-drama': ['Sport', 'Drama'],
    'Zombie/Heist': ['Horror', 'Crime'],
    'Psychological horror': ['Psychological', 'Horror'],
    'Sports film': ['Sport'],
    'Comedy mystery': ['Comedy', 'Crime'],
    'Romantic thriller': ['Romantic', 'Thriller'],
    'Christmas comedy': ['Christmas', 'Comedy'],
    'War-Comedy': ['War', 'Comedy'],
    'Romantic comedy/Holiday': ['Romantic', 'Comedy'],
    'Adventure-romance': ['Adventure', 'Romantic'],
    'Adventure': ['Adventure'],
    'Horror-thriller': ['Horror', 'Thriller'],
    'Dance comedy': ['Comedy'],
    'Stop Motion': ['Animation'],
    'Horror/Crime drama': ['Horror', 'Crime', 'Drama'],
    'Urban fantasy': ['Fantasy'],
    'Drama/Horror': ['Drama', 'Horror'],
    'Family/Comedy-drama': ['Family', 'Comedy', 'Drama'],
    'War': ['War'],
    'Crime thriller': ['Crime', 'Thriller'],
    'Science fiction/Action': ['Science fiction', 'Action'],
    'Teen comedy horror': ['Comedy', 'Horror'],
    'Concert Film': ['Concert'],
    'Musical comedy': ['Musical', 'Comedy'],
    'Animation/Musical/Adventure': ['Animation', 'Musical', 'Adventure'],
    'Animation / Musicial': ['Animation', 'Musical'],
    'Animation/Comedy/Adventure': ['Animation', 'Comedy', 'Adventure'],
    'Action thriller': ['Action', 'Thriller'],
    'Anime/Science fiction': ['Anime', 'Science fiction'],
    'Animation / Short': ['Animation', 'Short'],
    'War drama': ['War', 'Drama'],
    'Family/Christmas musical': ['Family', 'Christmas', 'Musical'],
    'Science fiction thriller': ['Science fiction', 'Thriller'],
    'Drama / Short': ['Drama', 'Short'],
    'Hidden-camera prank comedy': ['Comedy'],
    'Spy thriller': ['Thriller'],
    'Anime/Fantasy': ['Anime', 'Fantasy'],
    'Animated musical comedy': ['Animation', 'Musical', 'Comedy'],
    'Variety Show': ['Variety show'],
    'Superhero/Action': ['Superhero', 'Action'],
    'Biographical/Comedy': ['Biographical', 'Comedy'],
    'Historical-epic': ['Historical'],
    'Animation / Comedy': ['Animation', 'Comedy'],
    'Christmas/Fantasy/Adventure/Comedy': ['Christmas', 'Fantasy', 'Adventure', 'Comedy'],
    'Mentalism special': ['Documentary', 'Thriller'],
    'Drama-Comedy': ['Drama', 'Comedy'],
    'Coming-of-age comedy-drama': ['Comedy', 'Drama'],
    'Historical drama': ['Historical', 'Drama'],
    'Making-of': ['Documentary'],
    'Action-adventure': ['Action', 'Adventure'],
    'Animation / Science Fiction': ['Animation', 'Science fiction'],
    'Anthology/Dark comedy': ['Comedy'],
    'Musical / Short': ['Musical', 'Short'],
    'Animation/Christmas/Comedy/Adventure': ['Animation', 'Christmas', 'Comedy', 'Adventure']
}

new_genres = list()
for new_genre in genres_norm.values():
    new_genres.append('/'.join(new_genre))

genres_to_change = dict(zip(genres_norm.keys(), new_genres))
netflix['genre'] = netflix['genre'].map(genres_to_change)
netflix['genre']

0                Documentary
1                   Thriller
2      Science fiction/Drama
3            Horror/Thriller
4                      Crime
               ...          
579                  Concert
580              Documentary
581             One-man show
582              Documentary
583              Documentary
Name: genre, Length: 584, dtype: object

In [230]:
netflix[netflix['genre'].isna()]

Unnamed: 0,title,genre,premiere,runtime,imdb,language


In [231]:
genres = list()
for values in genres_norm.values():
    for val in values:
        genres.append(val)

In [232]:
genres_occ = dict(zip(genres, np.zeros(len(genres))))

for values in netflix['genre']:
    values = values.split('/')
    for val in values:
        val = val.strip()
        if val in genres:
            genres_occ[val] += 1

sorted(genres_occ.items(), key=lambda x:x[1], reverse=True)

[('Documentary', 162.0),
 ('Comedy', 145.0),
 ('Drama', 129.0),
 ('Thriller', 59.0),
 ('Romantic', 57.0),
 ('Horror', 23.0),
 ('Action', 21.0),
 ('Crime', 20.0),
 ('Science fiction', 19.0),
 ('Animation', 18.0),
 ('Musical', 14.0),
 ('Adventure', 10.0),
 ('Biographical', 10.0),
 ('Short', 7.0),
 ('Family', 7.0),
 ('Fantasy', 6.0),
 ('Aftershow interview', 6.0),
 ('Psychological', 6.0),
 ('Concert', 6.0),
 ('Superhero', 5.0),
 ('Christmas', 5.0),
 ('Variety show', 5.0),
 ('War', 5.0),
 ('Western', 4.0),
 ('Anime', 4.0),
 ('Sport', 4.0),
 ('One-man show', 3.0),
 ('Historical', 2.0)]

In [233]:
other_genres = [k for k, v in genres_occ.items() if v < 10]
print(other_genres)
other_genre_count = sum({k:v for k, v in genres_occ.items() if v < 10}.values())

genres_occ = {k:v for k, v in genres_occ.items() if v >= 10}
genres_occ['Other genre'] = other_genre_count
genres_occ

['Western', 'Fantasy', 'Superhero', 'Anime', 'Short', 'Family', 'Aftershow interview', 'Christmas', 'Variety show', 'Psychological', 'One-man show', 'Sport', 'War', 'Concert', 'Historical']


{'Documentary': 162.0,
 'Thriller': 59.0,
 'Science fiction': 19.0,
 'Drama': 129.0,
 'Horror': 23.0,
 'Crime': 20.0,
 'Action': 21.0,
 'Comedy': 145.0,
 'Musical': 14.0,
 'Romantic': 57.0,
 'Animation': 18.0,
 'Adventure': 10.0,
 'Biographical': 10.0,
 'Other genre': 75.0}

In [234]:
clean_genres = list()
for genres in netflix['genre']:
    genres = genres.split('/')
    cur_genres = ''

    for genre in genres:
        cur_genres += 'Other genre' if genre not in genres_occ.keys() else genre
        cur_genres += '/'
    
    cur_genres = cur_genres[:-1]
    clean_genres.append(cur_genres)

netflix['genre'] = clean_genres
netflix['genre'].unique()

array(['Documentary', 'Thriller', 'Science fiction/Drama',
       'Horror/Thriller', 'Crime', 'Action', 'Comedy', 'Crime/Thriller',
       'Musical/Other genre/Other genre', 'Drama', 'Romantic/Comedy',
       'Action/Comedy', 'Horror', 'Other genre/Comedy', 'Romantic/Drama',
       'Other genre/Other genre', 'Other genre', 'Animation/Other genre',
       'Action/Thriller', 'Comedy/Drama', 'Animation',
       'Other genre/Musical', 'Science fiction/Adventure',
       'Science fiction', 'Comedy/Other genre/Other genre',
       'Horror/Drama', 'Action/Science fiction', 'Comedy/Musical',
       'Musical', 'Science fiction/Crime', 'Crime/Drama',
       'Other genre/Thriller/Drama', 'Adventure/Comedy', 'Romantic',
       'Horror/Comedy', 'Comedy/Horror', 'Other genre/Thriller',
       'Biographical', 'Science fiction/Thriller',
       'Romantic/Comedy/Drama', 'Other genre/Drama', 'Horror/Crime',
       'Other genre/Horror', 'Comedy/Crime', 'Romantic/Thriller',
       'Adventure/Romantic', 'A

In [236]:
for genre in genres_occ.keys():
    cur_genre_values = list()
    for genres in netflix['genre']:
        cur_genre_values.append(1 if genre in genres else 0)
    netflix[genre] = cur_genre_values

netflix.drop('genre', axis=1, inplace=True)
netflix

Unnamed: 0,title,premiere,runtime,imdb,language,Documentary,Thriller,Science fiction,Drama,Horror,Crime,Action,Comedy,Musical,Romantic,Animation,Adventure,Biographical,Other genre
0,Enter the Anime,"August 5, 2019",58,2.5,English/Japanese,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Dark Forces,"August 21, 2020",81,2.6,Spanish,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,The App,"December 26, 2019",79,2.6,Italian,0,0,1,1,0,0,0,0,0,0,0,0,0,0
3,The Open House,"January 19, 2018",94,3.2,English,0,1,0,0,1,0,0,0,0,0,0,0,0,0
4,Kaali Khuhi,"October 30, 2020",90,3.4,Hindi,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,Taylor Swift: Reputation Stadium Tour,"December 31, 2018",125,8.4,English,0,0,0,0,0,0,0,0,0,0,0,0,0,1
580,Winter on Fire: Ukraine's Fight for Freedom,"October 9, 2015",91,8.4,English/Ukranian/Russian,1,0,0,0,0,0,0,0,0,0,0,0,0,0
581,Springsteen on Broadway,"December 16, 2018",153,8.5,English,0,0,0,0,0,0,0,0,0,0,0,0,0,1
582,Emicida: AmarElo - It's All For Yesterday,"December 8, 2020",89,8.6,Portuguese,1,0,0,0,0,0,0,0,0,0,0,0,0,0
