## Data import

In [1]:
import pandas as pd
from tqdm.notebook import tqdm_notebook, tqdm
import numpy as np
from cleverminer import cleverminer

Cleverminer version  1.0.2  educational version. For production use (bugfixes and keeping compatibility), ask for PRO version.


In [2]:
# movies
movies = pd.read_csv('./data/rotten_tomatoes_movies.csv', index_col = 'rotten_tomatoes_link')

In [3]:
# rotten_tomatoes_critic_reviews was too big it had to be split to be uploaded to GITHub
filenames = ['rotten_tomatoes_critic_reviews.csv', 'rotten_tomato_reviews_1.csv', 'rotten_tomato_reviews_2.csv', 'rotten_tomato_reviews_3.csv', 'rotten_tomato_reviews_4.csv', 'rotten_tomato_reviews_5.csv']
reviews_list = []

for filename in filenames:
    review = pd.read_csv(f'./data/{filename}', index_col='rotten_tomatoes_link')
    reviews_list.append(review)

# combining the critics review dataset back together
reviews = pd.concat(reviews_list)

## Data preprocessing

### movies

In [4]:
# deleting useless columna movies_info, critics_consensus containing long strings
movies.drop('movie_info', axis=1, inplace=True)
movies.drop('critics_consensus', axis=1, inplace=True)

### reviews

In [5]:
# dropping rows with missing review_score values
reviews = reviews[reviews['review_score'].notna()]

# deleting useless column review_cotent containing long strings
reviews = reviews.drop('review_content', axis=1)

In [6]:
#unique values in review_score 
# unq = reviews['review_score'].unique()
# unq.sort()
# unq

#moje poznamky co napsat do zpravy
#FILTROVANI DAT NA ZAKLADE CRITICS_SCORE
# DLE RAUCHA JE TO OK..
# --- vime o tom a vysledek muze byt timto zpusobem zkreslen

In [7]:
# deleting reviews with non-numeric review scores (approx 80 000 values)
reviews = reviews[reviews['review_score'].str.contains('[A-Za-z]') == False]

# splitting review scores into columns
split = reviews['review_score'].str.split('/')

 #calculating percentage scores
scores =  pd.to_numeric(split.str.get(0)) / pd.to_numeric(split.str.get(1))
scores = scores*100

# assigning calculated values to the reviews scores column
reviews['review_score'] = scores

## Final dataset

In [8]:
#Merging the two datasets - reviews and movies
data = pd.merge(movies, reviews, how='inner', on = 'rotten_tomatoes_link')

In [9]:
data['actors'] = data['actors'].str.split(', ')

# remove rows with missing values
data = data[data['actors'].isnull() == False]
data = data[data['review_score'].isnull() == False]
data = data[np.isinf(data['review_score']) == False]

In [None]:
## Rauch říkal, že by bylo moc práce to rozdělovat, pro naší SP stačí použít kategorie tak jak jsou...
## do zpravy je treba napsat, ze nam to takto rekl
## /--- se souhlasem cviciho
## muzeme pouzit kategorie tak jak jsou

# 

In [21]:
data['genres'] = data['genres'].str.split(', ')
data = data[data['genres'].isnull() == False]

### New columns

In [13]:
# new column with review score intervals
def review_score_interval(row):
    if row['review_score'] >= 90:
        return '>90'
    elif row['review_score'] >= 70:
        return '70-90'
    elif row['review_score'] >= 50:
        return '50-70'
    elif row['review_score'] >= 30:
        return '30-50'
    else:
        return '<30'

data["review_score_interval"] = data.apply(review_score_interval, axis=1)

In [14]:
# new column with tomatometer rating interval
def tomatometer_rating_interval(row):
    if row['tomatometer_rating'] >= 90:
        return '>90'
    elif row['tomatometer_rating'] >= 70:
        return '70-90'
    elif row['tomatometer_rating'] >= 50:
        return '50-70'
    elif row['tomatometer_rating'] >= 30:
        return '30-50'
    else:
        return '<30'

data["tomatometer_rating_interval"] = data.apply(tomatometer_rating_interval, axis=1)

In [15]:
# new column with runtime intervals
def runtime_interval(row):
    if row['runtime'] >= 150:
        return '>2.5h'
    elif row['runtime'] >= 120:
        return '2-2.5'
    elif row['runtime'] >= 105:
        return '1.75-2h'
    elif row['runtime'] >= 90:
        return '1.5-1.75h'
    else:
        return '<1.5h'

data["runtime_interval"] = data.apply(runtime_interval, axis=1)

In [85]:
def is_horror(row):
    if 'Horror' in row['genres']:
        return True
    else: return False
data["Horror"] = data.apply(is_horror, axis=1)
    

In [56]:
pd.set_option('display.max_columns', None)
data.head(1)

Unnamed: 0_level_0,movie_title,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_score_interval,tomatometer_rating_interval,runtime_interval,release_decade_interval
rotten_tomatoes_link,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
m/0814255,Percy Jackson & the Olympians: The Lightning T...,PG,"[Action & Adventure, Comedy, Drama, Science Fi...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","[Logan Lerman, Brandon T. Jackson, Alexandra D...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Ben McEachen,False,Sunday Mail (Australia),Fresh,70.0,2010-02-09,70-90,30-50,1.75-2h,2010's


## top genres

In [84]:
genres_scores = dict()
for row in tqdm(range(data.shape[0])):
    value = data['review_score'][row]
    for genre in data['genres'][row]:
        if genre not in genres_scores.keys():
            genres_scores[genre] = [value]
        else:
            genres_scores[genre].append(value)

  0%|          | 0/675528 [00:00<?, ?it/s]

In [82]:
# calculating average scores for each actor
final_genres_scores = {}
for key in tqdm(genres_scores.keys()):
    # add only actors with 3000+ reviews 
    if len(genres_scores[key]) > 100:
        mean = np.mean(genres_scores[key])
        final_genres_scores[key] = round(mean, 1)

  0%|          | 0/21 [00:00<?, ?it/s]

In [83]:
final_genres_scores
top_genres = sorted(final_genres_scores.items(), key=lambda x: x[1], reverse=True)
top_genres

[('Classics', 75.7),
 ('Documentary', 71.5),
 ('Special Interest', 70.4),
 ('Anime & Manga', 69.9),
 ('Art House & International', 69.5),
 ('Sports & Fitness', 67.4),
 ('Animation', 66.8),
 ('Television', 66.8),
 ('Musical & Performing Arts', 66.3),
 ('Western', 66.1),
 ('Drama', 65.7),
 ('Cult Movies', 65.1),
 ('Gay & Lesbian', 63.8),
 ('Science Fiction & Fantasy', 62.6),
 ('Romance', 62.5),
 ('Kids & Family', 62.0),
 ('Faith & Spirituality', 61.8),
 ('Mystery & Suspense', 61.4),
 ('Action & Adventure', 60.7),
 ('Comedy', 60.2),
 ('Horror', 57.3)]

## TOP10 actors

In [10]:
# creating dict with following annotation actor: [scores]
actor_scores = dict()
for row in tqdm(range(data.shape[0])):
    value = data['review_score'][row]
    for actor in data['actors'][row]:
        if actor not in actor_scores.keys():
            actor_scores[actor] = [value]
        else:
            actor_scores[actor].append(value)

  0%|          | 0/687766 [00:00<?, ?it/s]

In [11]:
# calculating average scores for each actor
final_scores = {}
for key in tqdm(actor_scores.keys()):
    
    # add only actors with 3000+ reviews 
    if len(actor_scores[key]) > 3000:
        mean = np.mean(actor_scores[key])
        final_scores[key] = round(mean, 1)

  0%|          | 0/203808 [00:00<?, ?it/s]

In [12]:
top10 = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)
top10[0:10]

[('John Ratzenberger', 78.6),
 ('Michael Stuhlbarg', 75.2),
 ('Andy Serkis', 75.0),
 ('Adam Driver', 74.6),
 ('Leonardo DiCaprio', 73.3),
 ('Anthony Daniels', 73.1),
 ('Domhnall Gleeson', 72.5),
 ('Warwick Davis', 72.4),
 ('Tom Holland (II)', 72.3),
 ('Tilda Swinton', 72.0)]

## Otazka 7 - neni nic moc :D

In [49]:
df = data
df=df.loc[:, ('critic_name','top_critic', 'review_score_interval')]



clm = cleverminer(df=df,proc='4ftMiner',
               quantifiers= {'conf':0.6, 'Base': 50},
               ante ={
                    'attributes':[
                        {'name': 'top_critic', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'critic_name', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':2, 'maxlen':2, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )

# print(clm.result)
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 2695, rules 31,control number:0, times: prep 25.931563138961792, processing 72.89253401756287

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 2695
Number of rules : 31
Total time needed : 00h 01m 38s
Time of data preparation : 00h 00m 25s
Time of rule mining : 00h 01m 12s


## Otazka 3

In [38]:
# bezi cca 2m 24s

df = data
df=df.loc[:, ('release_decade_interval', 'review_score_interval', 'genres', 'production_company')]



clm = cleverminer(df=df,proc='4ftMiner',
               quantifiers= {'conf':0.6, 'Base': 50},
               ante ={
                    'attributes':[
                        {'name': 'production_company', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'genres', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'release_decade_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':3, 'maxlen':3, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )

# print(clm.result)
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 2984, rules 279,control number:0, times: prep 16.278171062469482, processing 128.39154815673828

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 2984
Number of rules : 279
Total time needed : 00h 02m 24s
Time of data preparation : 00h 00m 16s
Time of rule mining : 00h 02m 0

## Otázka 4

In [16]:
# new column with release interval
data = data[data['original_release_date'].notnull()]

def release_decade_interval(row):
    split = row["original_release_date"].split('-')
    year = split[0]
    interval =  str(pd.to_numeric(year)//10*10) + '\'s'
    return interval

data["release_decade_interval"] = data.apply(release_decade_interval, axis=1)

### 4ftMiner

In [88]:

df = data
df=df.loc[:, ('tomatometer_rating_interval', 'release_decade_interval', 'production_company')]



clm = cleverminer(df=df,proc='4ftMiner',
               quantifiers= {'conf':0.8, 'Base': 100},
               ante ={
                    'attributes':[
                        {'name': 'production_company', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'release_decade_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':2, 'maxlen':2, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'tomatometer_rating_interval', 'type': 'one', 'value': '>90'},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )

# print(clm.result)
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 249, rules 50,control number:0, times: prep 15.51873254776001, processing 10.54705810546875

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 249
Number of rules : 50
Total time needed : 00h 00m 26s
Time of data preparation : 00h 00m 15s
Time of rule mining : 00h 00m 10s


L

## Question X: Hledání souvislosti mezi režiséry a review score

### Jak se liší hodnocení filmů na základě režiséra, doby trvání a vhodnosti obsahu

In [30]:
# be patient, runs approx 50secs!
df = data
df=df.loc[:, ('directors', 'runtime_interval', 'content_rating', 'review_score_interval')]

clm = cleverminer(df=df,proc='4ftMiner',
               quantifiers= {'conf':0.8, 'Base': 50},
               ante ={
                    'attributes':[
                        {'name': 'directors', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'runtime_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'content_rating', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':2, 'maxlen':3, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )

clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 9780, rules 30,control number:0, times: prep 28.036372900009155, processing 16.148449897766113

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 9780
Number of rules : 30
Total time needed : 00h 00m 44s
Time of data preparation : 00h 00m 28s
Time of rule mining : 00h 00m 16s

## Question Y: Hledání souvislosti mezi kritiky a review score

### Jak se liší hodnocení filmů na základě kritika, zda je kritic top a  doby trvání

In [None]:
# be patient, runs approx 50secs!
df = data
df=df.loc[:, ('critic_name', 'runtime_interval', 'review_score', 'top_critic', 'review_score_interval')]

clm = cleverminer(df=df,proc='4ftMiner',
               quantifiers= {'conf':0.8, 'Base': 50},
               ante ={
                    'attributes':[
                        {'name': 'critic_name', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'runtime_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'top_critic', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':2, 'maxlen':3, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )

clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

## bonus: horror movies

In [66]:
horror_scores = dict()
for row in tqdm(range(data.shape[0])):
    if 'Horror' in data['genres'][row]:
        value = data['review_score'][row]
        name = data['movie_title'][row]
        if name not in horror_scores.keys():
            horror_scores[name] = [value]
        else:
            horror_scores[name].append(value)

  0%|          | 0/675528 [00:00<?, ?it/s]

In [76]:
# calculating average scores for each actor
final_horror_scores = {}
for key in tqdm(horror_scores.keys()):
    # add only actors with 3000+ reviews 
    if len(horror_scores[key]) > 50:
        mean = np.mean(horror_scores[key])
        final_horror_scores[key] = round(mean, 1)

  0%|          | 0/1777 [00:00<?, ?it/s]

In [89]:
top_horrors = sorted(final_horror_scores.items(), key=lambda x: x[1], reverse=True)

In [88]:
df = data
df=df.loc[:, ('Horror', 'runtime_interval', 'review_score_interval', 'tomatometer_rating_interval', 'movie_title', 'release_decade_interval', 'production_company')]




clm = cleverminer(df=df,proc='4ftMiner',
               quantifiers= {'conf':0.7, 'Base': 50},
               ante ={
                    'attributes':[
                        {'name': 'Horror', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'runtime_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'production_company', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':3, 'maxlen':3, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )

clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 2099, rules 15,control number:0, times: prep 83.37931704521179, processing 76.39226531982422

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 2099
Number of rules : 15
Total time needed : 00h 02m 39s
Time of data preparation : 00h 01m 23s
Time of rule mining : 00h 01m 16s



## Bonus: Nicolas Cage

In [21]:
def is_nicolas(row):
    if 'Nicolas Cage' in row['actors']:
        return True
    else: return False
data["nicolas cage"] = data.apply(is_nicolas, axis=1)

In [26]:
#JSOU FILMY S NICOLASEM CAGEM DOBRÉ?
df = data
df=df.loc[:, ('nicolas cage', 'runtime_interval', 'review_score', 'review_score_interval', 'tomatometer_rating')]




clm = cleverminer(df=df,proc='4ftMiner',
               quantifiers= {'conf':0.4, 'Base': 50},
               ante ={
                    'attributes':[
                        {'name': 'nicolas cage', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                        {'name': 'runtime_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
                    ], 'minlen':2, 'maxlen':2, 'type':'con'},
               succ ={
                    'attributes':[
                        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1}
                    ], 'minlen':1, 'maxlen':1, 'type':'con'}
               )

clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 43, rules 2,control number:0, times: prep 2.6506078243255615, processing 0.043886661529541016

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 43
Number of rules : 2
Total time needed : 00h 00m 02s
Time of data preparation : 00h 00m 02s
Time of rule mining : 00h 00m 00s


L