## Data import

In [23]:
import pandas as pd
from tqdm.notebook import tqdm_notebook, tqdm
import numpy as np
from cleverminer import cleverminer

In [24]:
# Movie-level metadata, indexed by each title's Rotten Tomatoes link.
movies = pd.read_csv(
    '../data/rotten_tomatoes_movies.csv',
    index_col='rotten_tomatoes_link',
)

In [25]:
# The critic-review export was too large to upload to GitHub in one piece, so
# it is stored as several CSV files that are read one by one.
filenames = [
    'rotten_tomatoes_critic_reviews.csv',
    'rotten_tomato_reviews_1.csv',
    'rotten_tomato_reviews_2.csv',
    'rotten_tomato_reviews_3.csv',
    'rotten_tomato_reviews_4.csv',
    'rotten_tomato_reviews_5.csv',
]
reviews_list = [
    pd.read_csv(f'../data/{filename}', index_col='rotten_tomatoes_link')
    for filename in filenames
]

# Stitch the parts back together into a single reviews frame.
reviews = pd.concat(reviews_list)

## Data preprocessing

### movies

In [26]:
# Remove the long free-text columns movie_info and critics_consensus.
movies.drop(columns=['movie_info', 'critics_consensus'], inplace=True)

### reviews

In [27]:
# Keep only reviews that carry a review_score, and discard the long free-text
# review_content column.
reviews = (
    reviews
    .dropna(subset=['review_score'])
    .drop(columns='review_content')
)

In [28]:
#unique values in review_score 
# unq = reviews['review_score'].unique()
# unq.sort()
# unq

# my notes on what to put in the report
# FILTERING THE DATA BASED ON CRITICS_SCORE
# ACCORDING TO RAUCH THIS IS OK..
# --- we are aware of it and the result may be skewed by this

In [29]:
# Discard reviews whose score contains letters (approx. 80 000 values).
# na=True treats a missing match result as "contains letters", so such rows
# are dropped exactly as the former `== False` comparison dropped them.
has_letters = reviews['review_score'].str.contains('[A-Za-z]', na=True)

# The explicit copy makes the column assignment at the end of this cell write
# to an independent frame instead of a filtered slice of the previous one.
reviews = reviews[~has_letters].copy()

# Split "numerator/denominator" scores into their two parts.
split = reviews['review_score'].str.split('/')

# Express each score as a percentage of its own scale.
scores = pd.to_numeric(split.str.get(0)) / pd.to_numeric(split.str.get(1))
scores = scores * 100

# Replace the textual score with the computed percentage.
reviews['review_score'] = scores

## Final dataset

In [30]:
# Attach the movie metadata to every review; the inner join keeps only links
# that occur in both frames.
data = movies.merge(reviews, how='inner', on='rotten_tomatoes_link')

In [31]:
# Drop rows that lack actors, a review score or a production company, and
# rows whose review score is infinite.
data = data.dropna(subset=['actors', 'review_score', 'production_company'])
data = data[~np.isinf(data['review_score'])]

In [32]:
## Rauch said that splitting it up would be too much work; for our term project it is enough to use the categories as they are...
## the report needs to state that he told us so
## /--- with the instructor's approval
## we can use the categories as they are

# 

In [33]:
# Keep only rows that have a genre value, then turn the comma-separated genre
# string into a list of genre names.
# Values that are already lists are left untouched, so re-running this cell
# no longer turns every entry into NaN and empties `data`.
data = data[data['genres'].notna()]
data = data.assign(
    genres=data['genres'].map(
        lambda genres: genres.split(', ') if isinstance(genres, str) else genres
    )
)

### New columns

In [34]:
# new column with review score intervals
def review_score_interval(row):
    """Return the interval label for the row's ``review_score``.

    The first lower bound the score reaches decides the label:
    '>90' (score >= 90), '70-90', '50-70', '30-50'; anything else is '<30'.
    """
    score = row['review_score']
    for lower_bound, label in ((90, '>90'), (70, '70-90'), (50, '50-70'), (30, '30-50')):
        if score >= lower_bound:
            return label
    return '<30'

# Label every row with its review-score interval; axis=1 hands each row to the helper.
data["review_score_interval"] = data.apply(review_score_interval, axis=1)

In [35]:
# new column with audience rating intervals
def audience_rating_interval(row):
    """Return the interval label for the row's ``audience_rating``.

    The first lower bound the rating reaches decides the label:
    '>90' (rating >= 90), '70-90', '50-70', '30-50'; anything else is '<30'.

    A missing rating (NaN/None) yields ``np.nan`` instead of being counted
    in the '<30' bucket, which is where it ended up before because every
    comparison against NaN is False.
    """
    rating = row['audience_rating']
    if pd.isna(rating):
        return np.nan
    if rating >= 90:
        return '>90'
    elif rating >= 70:
        return '70-90'
    elif rating >= 50:
        return '50-70'
    elif rating >= 30:
        return '30-50'
    else:
        return '<30'

# Label every row with its audience-rating interval; axis=1 hands each row to the helper.
data["audience_rating_interval"] = data.apply(audience_rating_interval, axis=1)

In [36]:
# new column with tomatometer rating interval
def tomatometer_rating_interval(row):
    """Return the interval label for the row's ``tomatometer_rating``.

    The first lower bound the rating reaches decides the label:
    '>90' (rating >= 90), '70-90', '50-70', '30-50'; anything else is '<30'.

    A missing rating (NaN/None) yields ``np.nan`` instead of being counted
    in the '<30' bucket, which is where it ended up before because every
    comparison against NaN is False.
    """
    rating = row['tomatometer_rating']
    if pd.isna(rating):
        return np.nan
    if rating >= 90:
        return '>90'
    elif rating >= 70:
        return '70-90'
    elif rating >= 50:
        return '50-70'
    elif rating >= 30:
        return '30-50'
    else:
        return '<30'

# Label every row with its tomatometer-rating interval; axis=1 hands each row to the helper.
data["tomatometer_rating_interval"] = data.apply(tomatometer_rating_interval, axis=1)

In [37]:
# new column with runtime intervals
def runtime_interval(row):
    """Return the interval label for the row's ``runtime``.

    The first lower bound the runtime reaches decides the label:
    '>2.5h' (runtime >= 150), '2-2.5' (>= 120), '1.75-2h' (>= 105),
    '1.5-1.75h' (>= 90); anything else is '<1.5h'.

    A missing runtime (NaN/None) yields ``np.nan`` instead of being counted
    in the '<1.5h' bucket, which is where it ended up before because every
    comparison against NaN is False.

    NOTE(review): the '2-2.5' label has no trailing 'h', unlike its
    neighbours. It is kept as-is because the label is emitted data.
    """
    runtime = row['runtime']
    if pd.isna(runtime):
        return np.nan
    if runtime >= 150:
        return '>2.5h'
    elif runtime >= 120:
        return '2-2.5'
    elif runtime >= 105:
        return '1.75-2h'
    elif runtime >= 90:
        return '1.5-1.75h'
    else:
        return '<1.5h'

# Label every row with its runtime interval; axis=1 hands each row to the helper.
data["runtime_interval"] = data.apply(runtime_interval, axis=1)

In [38]:
# The release-decade column is derived from original_release_date, so rows
# without that date are dropped first.
data = data.dropna(subset=['original_release_date'])

def release_decade_interval(row):
    """Return the release decade of the row as a label such as "2010's".

    The year is the part of ``original_release_date`` before the first '-';
    it is floored to a multiple of ten and suffixed with "'s".
    """
    year, _, _ = row["original_release_date"].partition('-')
    decade = pd.to_numeric(year) // 10 * 10
    return str(decade) + '\'s'

# Label every row with its release decade; axis=1 hands each row to the helper.
data["release_decade_interval"] = data.apply(release_decade_interval, axis=1)

In [39]:
def is_horror(row):
    """Return True when 'Horror' occurs in the row's ``genres``, else False."""
    return 'Horror' in row['genres']
# Flag every row whose genres include 'Horror'; axis=1 hands each row to the helper.
data["Horror"] = data.apply(is_horror, axis=1)
    

## top genres

In [40]:
# Collect every review score under each genre of the reviewed movie:
# genres_scores maps genre name -> list of review scores.
# The columns are walked in parallel instead of being indexed with an integer
# row number: `data` is indexed by rotten_tomatoes_link, so integer keys only
# worked through pandas' deprecated positional fallback, and each lookup was
# a separate Series access.
genres_scores = dict()
score_genre_pairs = zip(data['review_score'], data['genres'])
for value, genres in tqdm(score_genre_pairs, total=data.shape[0]):
    for genre in genres:
        genres_scores.setdefault(genre, []).append(value)

  0%|          | 0/670411 [00:00<?, ?it/s]

In [41]:
# Average review score per genre, rounded to one decimal; only genres with
# more than 100 reviews are kept.
final_genres_scores = {}
for genre_name, genre_reviews in tqdm(genres_scores.items()):
    if len(genre_reviews) <= 100:
        continue
    final_genres_scores[genre_name] = round(np.mean(genre_reviews), 1)

  0%|          | 0/21 [00:00<?, ?it/s]

In [42]:
# Genres ranked from the highest to the lowest average review score.
# (The bare `final_genres_scores` expression that used to open this cell was
# a no-op, since only a cell's last expression is displayed.)
genres_rating = sorted(final_genres_scores.items(), key=lambda x: x[1], reverse=True)
genres_rating

[('Classics', 75.7),
 ('Documentary', 71.5),
 ('Special Interest', 70.4),
 ('Art House & International', 69.5),
 ('Anime & Manga', 69.3),
 ('Sports & Fitness', 67.4),
 ('Animation', 66.8),
 ('Television', 66.8),
 ('Musical & Performing Arts', 66.3),
 ('Western', 66.1),
 ('Drama', 65.7),
 ('Cult Movies', 65.2),
 ('Gay & Lesbian', 63.9),
 ('Science Fiction & Fantasy', 62.6),
 ('Romance', 62.5),
 ('Kids & Family', 62.0),
 ('Faith & Spirituality', 61.7),
 ('Mystery & Suspense', 61.4),
 ('Action & Adventure', 60.7),
 ('Comedy', 60.2),
 ('Horror', 57.2)]

## TOP10 actors

### Kterých top 10 herců/hereček má nejlepší skóre kritiky

In [43]:
# Turn the comma-separated actor string into a list of actor names. Values
# that are already lists are left untouched so the cell can be re-run safely.
data = data.assign(
    actors=data['actors'].map(
        lambda names: names.split(', ') if isinstance(names, str) else names
    )
)

# creating dict with following annotation actor: [scores]
# The columns are walked in parallel instead of being indexed with an integer
# row number: `data` is indexed by rotten_tomatoes_link, so integer keys only
# worked through pandas' deprecated positional fallback.
actor_scores = dict()
score_actor_pairs = zip(data['review_score'], data['actors'])
for value, actors in tqdm(score_actor_pairs, total=data.shape[0]):
    for actor in actors:
        actor_scores.setdefault(actor, []).append(value)

  0%|          | 0/670411 [00:00<?, ?it/s]

In [44]:
# Average review score per actor, rounded to one decimal; only actors with
# more than 3000 reviews are kept.
final_scores = {}
for actor_name, actor_reviews in tqdm(actor_scores.items()):
    if len(actor_reviews) <= 3000:
        continue
    final_scores[actor_name] = round(np.mean(actor_reviews), 1)

  0%|          | 0/197331 [00:00<?, ?it/s]

In [45]:
# Rank the qualifying actors by average review score, best first, and show
# the ten highest (top10 itself holds the full ranking).
top10 = sorted(final_scores.items(), key=lambda pair: pair[1], reverse=True)
top10[:10]

[('John Ratzenberger', 78.6),
 ('Adam Driver', 75.3),
 ('Michael Stuhlbarg', 75.2),
 ('Andy Serkis', 75.0),
 ('Leonardo DiCaprio', 74.1),
 ('Anthony Daniels', 73.1),
 ('Domhnall Gleeson', 72.5),
 ('Warwick Davis', 72.5),
 ('Tom Holland (II)', 72.3),
 ('Tilda Swinton', 72.0)]

In [46]:
# Lift the column display limit and preview the first row of the final dataset.
pd.options.display.max_columns = None
data.iloc[:1]

Unnamed: 0_level_0,movie_title,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_score_interval,audience_rating_interval,tomatometer_rating_interval,runtime_interval,release_decade_interval,Horror
rotten_tomatoes_link,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
m/0814255,Percy Jackson & the Olympians: The Lightning T...,PG,"[Action & Adventure, Comedy, Drama, Science Fi...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","[Logan Lerman, Brandon T. Jackson, Alexandra D...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Ben McEachen,False,Sunday Mail (Australia),Fresh,70.0,2010-02-09,70-90,50-70,30-50,1.75-2h,2010's,False


### histogram pro hodnocení filmů dle režiséra a production company, release decade a content rating

In [47]:
# CFMiner task with review_score_interval as target; the condition is built
# from 3 to 4 of the attributes directors, production_company,
# release_decade_interval and content_rating.
# NOTE(review): tomatometer_status and tomatometer_top_critics_count are
# selected into df but not referenced in the task definition.
df = data.loc[:, [
    'directors', 'release_decade_interval', 'review_score_interval',
    'production_company', 'tomatometer_status',
    'tomatometer_top_critics_count', 'content_rating',
]]

condition_attributes = [
    {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
    for column in ('directors', 'production_company', 'release_decade_interval', 'content_rating')
]

clm = cleverminer(
    df=df,
    target='review_score_interval',
    proc='CFMiner',
    quantifiers={'S_Up': 2, 'Base': 750},
    cond={'attributes': condition_attributes, 'minlen': 3, 'maxlen': 4, 'type': 'con'},
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  CFMiner
Starting to mine rules.
Done. Total verifications : 212, rules 131,control number:0, times: prep 15.669250011444092, processing 14.093694925308228

CleverMiner task processing summary:

Task type : CFMiner
Number of verifications : 212
Number of rules : 131
Total time needed : 00h 00m 29s
Time of data preparation : 00h 00m 15s
Time of rule mining : 00h 00m 14s



### histogram pro hodnocení filmů dle kritika a publishera, typu review a je-li kritik top-kritik

In [48]:
# CFMiner task with review_score_interval as target; the condition is built
# from 3 to 4 of the attributes critic_name, publisher_name, review_type and
# top_critic.
# NOTE(review): release_decade_interval is selected into df but not
# referenced in the task definition.
df = data.loc[:, [
    'critic_name', 'release_decade_interval', 'review_score_interval',
    'publisher_name', 'review_type', 'top_critic',
]]

condition_attributes = [
    {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
    for column in ('critic_name', 'publisher_name', 'review_type', 'top_critic')
]

clm = cleverminer(
    df=df,
    target='review_score_interval',
    proc='CFMiner',
    quantifiers={'S_Up': 2, 'Base': 500},
    cond={'attributes': condition_attributes, 'minlen': 3, 'maxlen': 4, 'type': 'con'},
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  CFMiner
Starting to mine rules.
Done. Total verifications : 1514, rules 828,control number:0, times: prep 11.32285475730896, processing 25.42260503768921

CleverMiner task processing summary:

Task type : CFMiner
Number of verifications : 1514
Number of rules : 828
Total time needed : 00h 00m 36s
Time of data preparation : 00h 00m 11s
Time of rule mining : 00h 00m 25s



## Otazka 7 - neni nic moc :D

In [49]:
# 4ftMiner task: the antecedent uses both top_critic and critic_name, the
# succedent is review_score_interval.
df = data.loc[:, ['critic_name', 'top_critic', 'review_score_interval']]

antecedent = {
    'attributes': [
        {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
        for column in ('top_critic', 'critic_name')
    ],
    'minlen': 2, 'maxlen': 2, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='4ftMiner',
    quantifiers={'conf': 0.7, 'Base': 100},
    ante=antecedent,
    succ=succedent,
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 1525, rules 3,control number:0, times: prep 8.988273859024048, processing 1.9136362075805664

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 1525
Number of rules : 3
Total time needed : 00h 00m 10s
Time of data preparation : 00h 00m 08s
Time of rule mining : 00h 00m 01s




## Otazka 3

In [50]:
# takes roughly 2 min 24 s to run

# 4ftMiner task: the antecedent uses production_company, genres and
# release_decade_interval together, the succedent is review_score_interval.
df = data.loc[:, ['release_decade_interval', 'review_score_interval', 'genres', 'production_company']]

antecedent = {
    'attributes': [
        {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
        for column in ('production_company', 'genres', 'release_decade_interval')
    ],
    'minlen': 3, 'maxlen': 3, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='4ftMiner',
    quantifiers={'conf': 0.6, 'Base': 50},
    ante=antecedent,
    succ=succedent,
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 2969, rules 278,control number:0, times: prep 5.856578826904297, processing 35.73852205276489

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 2969
Number of rules : 278
Total time needed : 00h 00m 41s
Time of data preparation : 00h 00m 05s
Time of rule mining : 00h 00m 35s

## Otázka 4

### 4ftMiner

In [51]:

# 4ftMiner task: the antecedent uses production_company and
# release_decade_interval together, the succedent is fixed to the '>90'
# value of tomatometer_rating_interval.
df = data.loc[:, ['tomatometer_rating_interval', 'release_decade_interval', 'production_company']]

antecedent = {
    'attributes': [
        {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
        for column in ('production_company', 'release_decade_interval')
    ],
    'minlen': 2, 'maxlen': 2, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'tomatometer_rating_interval', 'type': 'one', 'value': '>90'},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='4ftMiner',
    quantifiers={'conf': 0.8, 'Base': 100},
    ante=antecedent,
    succ=succedent,
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 244, rules 50,control number:0, times: prep 4.065985918045044, processing 0.5615181922912598

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 244
Number of rules : 50
Total time needed : 00h 00m 04s
Time of data preparation : 00h 00m 04s
Time of rule mining : 00h 00m 00s




## Question X: Hledání souvislosti mezi režiséry a review score

### Jak se liší hodnocení filmů na základě režiséra, doby trvání a vhodnosti obsahu

In [52]:
# be patient, runs approx 50secs!

# 4ftMiner task: the antecedent uses 2 to 3 of directors, runtime_interval
# and content_rating, the succedent is review_score_interval.
df = data.loc[:, ['directors', 'runtime_interval', 'content_rating', 'review_score_interval']]

antecedent = {
    'attributes': [
        {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
        for column in ('directors', 'runtime_interval', 'content_rating')
    ],
    'minlen': 2, 'maxlen': 3, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='4ftMiner',
    quantifiers={'conf': 0.8, 'Base': 50},
    ante=antecedent,
    succ=succedent,
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 9661, rules 27,control number:0, times: prep 12.900224924087524, processing 11.365200757980347

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 9661
Number of rules : 27
Total time needed : 00h 00m 24s
Time of data preparation : 00h 00m 12s
Time of rule mining : 00h 00m 11s

## Question Y: Hledání souvislosti mezi kritiky a review score

### Jak se liší hodnocení filmů na základě kritika, zda je kritik top, a doby trvání

In [53]:
# be patient, runs approx 50secs!

# 4ftMiner task: the antecedent uses 2 to 3 of critic_name, runtime_interval
# and top_critic, the succedent is review_score_interval.
# NOTE(review): the raw review_score column is selected into df but not
# referenced in the task definition.
df = data.loc[:, ['critic_name', 'runtime_interval', 'review_score', 'top_critic', 'review_score_interval']]

antecedent = {
    'attributes': [
        {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
        for column in ('critic_name', 'runtime_interval', 'top_critic')
    ],
    'minlen': 2, 'maxlen': 3, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='4ftMiner',
    quantifiers={'conf': 0.8, 'Base': 50},
    ante=antecedent,
    succ=succedent,
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 8770, rules 9,control number:0, times: prep 10.099143266677856, processing 7.239105224609375

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 8770
Number of rules : 9
Total time needed : 00h 00m 17s
Time of data preparation : 00h 00m 10s
Time of rule mining : 00h 00m 07s




## bonus: horror movies

In [54]:
# Collect the review scores of every movie whose genres include 'Horror':
# horror_scores maps movie title -> list of review scores.
# The columns are walked in parallel instead of being indexed with an integer
# row number: `data` is indexed by rotten_tomatoes_link, so integer keys only
# worked through pandas' deprecated positional fallback.
# NOTE(review): scores are keyed by movie_title, so different movies that
# share a title are pooled under one entry.
horror_scores = dict()
horror_rows = zip(data['genres'], data['review_score'], data['movie_title'])
for genres, value, name in tqdm(horror_rows, total=data.shape[0]):
    if 'Horror' in genres:
        horror_scores.setdefault(name, []).append(value)

  0%|          | 0/670411 [00:00<?, ?it/s]

In [55]:
# Average review score per horror movie title, rounded to one decimal; only
# titles with more than 50 reviews are kept.
final_horror_scores = {}
for title, title_reviews in tqdm(horror_scores.items()):
    if len(title_reviews) <= 50:
        continue
    final_horror_scores[title] = round(np.mean(title_reviews), 1)

  0%|          | 0/1730 [00:00<?, ?it/s]

In [56]:
# Horror titles ranked from the highest to the lowest average review score.
top_horrors = sorted(final_horror_scores.items(), key=lambda pair: pair[1], reverse=True)
top_horrors

[('Repulsion', 91.1),
 ('Aliens', 90.8),
 ("Rosemary's Baby", 90.6),
 ('The Silence of the Lambs', 90.2),
 ("Pan's Labyrinth", 89.2),
 ('The Shining', 86.5),
 ('Seconds', 85.5),
 ('Let the Right One In', 85.3),
 ('Psycho', 84.7),
 ('Get Out', 83.6),
 ('Hereditary', 82.3),
 ('A Quiet Place', 81.9),
 ('The Babadook', 81.7),
 ('Night of the Living Dead', 81.4),
 ('It Follows', 81.3),
 ('Under The Shadow', 81.0),
 ('The Lighthouse', 80.5),
 ('The Fly', 80.2),
 ('Re-Animator', 80.2),
 ('A Girl Walks Home Alone at Night', 80.1),
 ('Shaun of the Dead', 80.1),
 ('The Blair Witch Project', 79.8),
 ('The Cabin in the Woods', 79.8),
 ('Us', 79.8),
 ('Raw', 78.6),
 ('The Witch', 77.9),
 ('The Invisible Man', 77.8),
 ('The Love Witch', 76.9),
 ('Drag Me to Hell', 76.6),
 ('Green Room', 76.6),
 ('The Host', 76.6),
 ('The Little Stranger', 76.4),
 ('Train to Busan (Busanhaeng)', 76.4),
 ('Relic', 76.3),
 ('28 Days Later', 76.1),
 ('Midsommar', 76.1),
 ('The Others', 76.0),
 ('Santa Sangre', 75.7),
 (

In [57]:
# 4ftMiner task: the antecedent uses 3 to 4 of Horror (fixed to 'True'),
# runtime_interval, production_company and release_decade_interval; the
# succedent is review_score_interval.
# NOTE(review): tomatometer_rating_interval and movie_title are selected into
# df but not referenced in the task definition.
df = data.loc[:, [
    'Horror', 'runtime_interval', 'review_score_interval',
    'tomatometer_rating_interval', 'movie_title',
    'release_decade_interval', 'production_company',
]]

antecedent = {
    'attributes': [
        {'name': 'Horror', 'type': 'one', 'value': 'True'},
        *[
            {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
            for column in ('runtime_interval', 'production_company', 'release_decade_interval')
        ],
    ],
    'minlen': 3, 'maxlen': 4, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='4ftMiner',
    quantifiers={'conf': 0.7, 'Base': 50},
    ante=antecedent,
    succ=succedent,
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 3412, rules 29,control number:0, times: prep 26.0899760723114, processing 4.855337858200073

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 3412
Number of rules : 29
Total time needed : 00h 00m 30s
Time of data preparation : 00h 00m 26s
Time of rule mining : 00h 00m 04s




## Bonus: Nicolas Cage

In [58]:
# Lift the column display limit and preview the first row of the dataset.
pd.options.display.max_columns = None
data.iloc[:1]

Unnamed: 0_level_0,movie_title,content_rating,genres,directors,authors,actors,original_release_date,streaming_release_date,runtime,production_company,tomatometer_status,tomatometer_rating,tomatometer_count,audience_status,audience_rating,audience_count,tomatometer_top_critics_count,tomatometer_fresh_critics_count,tomatometer_rotten_critics_count,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_score_interval,audience_rating_interval,tomatometer_rating_interval,runtime_interval,release_decade_interval,Horror
rotten_tomatoes_link,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
m/0814255,Percy Jackson & the Olympians: The Lightning T...,PG,"[Action & Adventure, Comedy, Drama, Science Fi...",Chris Columbus,"Craig Titley, Chris Columbus, Rick Riordan","[Logan Lerman, Brandon T. Jackson, Alexandra D...",2010-02-12,2015-11-25,119.0,20th Century Fox,Rotten,49.0,149.0,Spilled,53.0,254421.0,43,73,76,Ben McEachen,False,Sunday Mail (Australia),Fresh,70.0,2010-02-09,70-90,50-70,30-50,1.75-2h,2010's,False


In [59]:
def is_nicolas(row):
    """Return True when 'Nicolas Cage' occurs in the row's ``actors``, else False."""
    return 'Nicolas Cage' in row['actors']
# Flag every row whose actors include 'Nicolas Cage'; axis=1 hands each row to the helper.
data["nicolas cage"] = data.apply(is_nicolas, axis=1)

In [60]:
# ARE MOVIES WITH NICOLAS CAGE GOOD?
# 4ftMiner task: the antecedent uses 2 to 3 of 'nicolas cage',
# runtime_interval and production_company; the succedent is
# review_score_interval.
# NOTE(review): release_decade_interval and content_rating are selected into
# df but not referenced in the task definition.
df = data.loc[:, [
    'nicolas cage', 'runtime_interval', 'review_score_interval',
    'release_decade_interval', 'production_company', 'content_rating',
]]

antecedent = {
    'attributes': [
        {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
        for column in ('nicolas cage', 'runtime_interval', 'production_company')
    ],
    'minlen': 2, 'maxlen': 3, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score_interval', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='4ftMiner',
    quantifiers={'conf': 0.4, 'Base': 50},
    ante=antecedent,
    succ=succedent,
)

# Report the run summary, the list of found rules and the details of rule 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
Done. Total verifications : 5145, rules 1546,control number:0, times: prep 4.47517991065979, processing 4.737284898757935

CleverMiner task processing summary:

Task type : 4ftMiner
Number of verifications : 5145
Number of rules : 1546
Total time needed : 00h 00m 09s
Time of data preparation : 00h 00m 04s
Time of rule mining : 00h 00m 04

### SD4ftMiner

#### jestli maji Nicolase radsi top kritici, nebo ti ostatni

In [61]:
# SD4ftMiner task: the antecedent is fixed to 'nicolas cage' == 'True', the
# succedent is review_score, and both compared sets (frst, scnd) are defined
# over top_critic.
# NOTE(review): the succedent uses the raw numeric review_score rather than
# review_score_interval used by the other tasks; confirm this is intended.
df = data.loc[:, ['nicolas cage', 'review_score', 'top_critic']]

antecedent = {
    'attributes': [
        {'name': 'nicolas cage', 'type': 'one', 'value': 'True'},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}
first_set = {
    'attributes': [
        {'name': 'top_critic', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}
second_set = {
    'attributes': [
        {'name': 'top_critic', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='SD4ftMiner',
    quantifiers={'Base1': 50, 'Base2': 50, 'Ratioconf': 2.0},
    ante=antecedent,
    succ=succedent,
    frst=first_set,
    scnd=second_set,
)

# Report the run summary, the list of found rules and the details of rules 1 and 2.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(1)
clm.print_rule(2)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  SD4ftMiner
Starting to mine rules.
Done. Total verifications : 56, rules 2,control number:0, times: prep 0.9212470054626465, processing 0.2084050178527832

CleverMiner task processing summary:

Task type : SD4ftMiner
Number of verifications : 56
Number of rules : 2
Total time needed : 00h 00m 01s
Time of data preparation : 00h 00m 00s
Time of rule mining : 00h 00m 00s



#### Jak se obecně liší top kritici vs. norm kritici na základě hodnocení filmu podle délky, produkční firmy nebo content ratingu

In [62]:
# SD4ftMiner task: the antecedent uses 2 to 3 of runtime_interval,
# production_company and content_rating, the succedent is review_score, and
# both compared sets (frst, scnd) are defined over top_critic.
# NOTE(review): genres is selected into df but not referenced in the task
# definition, and the succedent uses the raw numeric review_score rather than
# review_score_interval; confirm both are intended.
df = data.loc[:, [
    'runtime_interval', 'production_company', 'content_rating',
    'genres', 'review_score', 'top_critic',
]]

antecedent = {
    'attributes': [
        {'name': column, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
        for column in ('runtime_interval', 'production_company', 'content_rating')
    ],
    'minlen': 2, 'maxlen': 3, 'type': 'con',
}
succedent = {
    'attributes': [
        {'name': 'review_score', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}
first_set = {
    'attributes': [
        {'name': 'top_critic', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}
second_set = {
    'attributes': [
        {'name': 'top_critic', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
    ],
    'minlen': 1, 'maxlen': 1, 'type': 'con',
}

clm = cleverminer(
    df=df,
    proc='SD4ftMiner',
    quantifiers={'Base1': 2000, 'Base2': 2000, 'Ratioconf': 2.0},
    ante=antecedent,
    succ=succedent,
    frst=first_set,
    scnd=second_set,
)

# Report the run summary, the list of found rules and the details of rule 1.
clm.print_summary()
clm.print_rulelist()
clm.print_rule(1)

Cleverminer version 1.0.2. Note: This version is for personal and educational use only. If you need PRO version (support, fixing structures for compactibility in future versions for production deployment, additional development, licensing of commercial use of subroutines used), feel free to ask authors. Most of these functionalities are maintained in best-effort, as soon as this project is at given conditions for free use and rapid development is needed, they cannot be guaranteed.
Starting data preparation ...
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  SD4ftMiner
Starting to mine rules.
Done. Total verifications : 444, rules 3,control number:0, times: prep 6.597846984863281, processing 39.004878997802734

CleverMiner task processing summary:

Task type : SD4ftMiner
Number of verifications : 444
Number of rules : 3
Total time needed : 00h 00m 45s
Time of data preparation : 00h 00m 06s
Time of rule mining : 00h 00m 39s
