In [76]:
#
#  Tzerefos Anargiros - 2022202004022 - dit2022dsc@office365.uop.gr
#  Deligiannis Panagiotis - 2022202004004 - dit2004dsc@office365.uop.gr
#
# IMPORT REQUIRED DEPENDENCIES
import json
import os

import pandas as pd
from pymongo import MongoClient

In [85]:
# MONGODB HOST, WRITTEN AS THE '//<host>' PART OF THE URI e.g. //localhost
host = '//localhost' 

# MONGODB PORT e.g. 27017
port = '27017'

# CONNECT TO MONGODB (THE URI IS ASSEMBLED AS mongodb:<host>:<port>/?<options>)
client = MongoClient('mongodb:'+host+':'+port+'/?readPreference=primary&appname=MongoDB%20Compass&ssl=false')

# PATH OF THE NETFLIX TITLES JSON FILE THAT IS OPENED AND IMPORTED INTO MONGODB BELOW
json_output_path = "../data/output/netflix_titles.json" 

# TASK OUTPUT PATHS (KEPT AS STRINGS WITH A TRAILING SLASH: THE TASK CELLS
# BUILD FILE NAMES BY STRING CONCATENATION)
task_1_output = 'output/task_1/'
task_2_output = 'output/task_2/'
task_3_output = 'output/task_3/'
task_4_output = 'output/task_4/'
task_5_output = 'output/task_5/'

# MAKE SURE EVERY TASK OUTPUT DIRECTORY EXISTS, OTHERWISE WRITING THE CSV/JSON
# FILES FAILS ON A FRESH CHECKOUT. exist_ok KEEPS RE-RUNNING THIS CELL HARMLESS.
for task_output in (task_1_output, task_2_output, task_3_output, task_4_output, task_5_output):
    os.makedirs(task_output, exist_ok=True)

In [86]:
# LOAD JSON DATA
# The encoding is explicit so that non-ASCII titles and names are decoded the
# same way on every platform instead of depending on the locale default.
with open(json_output_path, encoding='utf-8') as f:
    file_data = json.load(f)

# VALIDATE BEFORE TOUCHING THE DATABASE
# insert_many needs a non-empty list of documents; checking here means a bad
# file can no longer empty the existing collection and then fail on insert.
if not isinstance(file_data, list) or not file_data:
    raise ValueError(
        "Expected a non-empty JSON array of documents in " + json_output_path
    )

# DATABASE NAME
dbname = 'netflix'

# COLLECTION NAME
collection = 'movies'

# HANDLE USED BY THE TASK CELLS
# NOTE: despite its name, `db` is the collection (netflix.movies), not the database.
db = client[dbname][collection]

# IMPORT DATA IN COLLECTION.
# delete_many({}) removes nothing when the database or collection does not
# exist yet, so no existence check is needed: the collection always ends up
# holding exactly the documents from the file.
db.delete_many({})
db.insert_many(file_data)


In [87]:
# TASK 1 PIPELINE: DOCUMENTS WHOSE date_added ENDS WITH 2019,
# KEEPING ONLY show_id, type AND title (PLUS THE DEFAULT _id)
task_1_pipeline = [
    {'$match': {'date_added': {'$regex': '2019$'}}},
    {'$project': {'show_id': 1, 'type': 1, 'title': 1}},
]
task_1 = db.aggregate(task_1_pipeline)

# COLUMNS REQUIRED FOR THE TASK, IN OUTPUT ORDER
columns = ['show_id', 'title', 'type']

# MATERIALISE THE CURSOR AS A DATAFRAME RESTRICTED TO THOSE COLUMNS
df = pd.DataFrame(list(task_1)).loc[:, columns]

# WRITE THE CSV AND JSON OUTPUT FILES
df.to_csv(task_1_output+"task_1.csv", index=False)
df.to_json(
    task_1_output+"task_1.json",
    orient="records",
    date_format="epoch",
    force_ascii=True,
    default_handler=None,
)

# DISPLAY THE FIRST 20 ROWS
display(df.head(20))

Unnamed: 0,show_id,title,type
0,81145628,Norm of the North: King Sized Adventure,Movie
1,80221550,Archibald's Next Big Thing,TV Show
2,81154455,Article 15,Movie
3,81113928,Care of Kancharapalem,Movie
4,81052275,Ee Nagaraniki Emaindi,Movie
5,81132437,Kill Me If You Dare,Movie
6,80178151,The Spy,TV Show
7,81176188,American Factory: A Conversation with the Obamas,Movie
8,81160036,Saawan,Movie
9,81173255,The Heretics,Movie


In [91]:
# TASK 2 PIPELINE: NUMBER OF TV SHOWS PER COUNTRY, MOST FREQUENT FIRST
task_2_pipeline = [
    # Split the comma-separated country string into a list
    {'$addFields': {'countries': {'$split': ['$country', ', ']}}},
    # Keep TV shows only
    {'$match': {'type': 'TV Show'}},
    # One document per (show, country)
    {'$unwind': {'path': '$countries'}},
    # Count documents per country
    {'$group': {'_id': '$countries', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
]
task_2 = db.aggregate(task_2_pipeline)

# COLUMNS REQUIRED FOR THE TASK, IN OUTPUT ORDER
columns = ['_id', 'count']

# MATERIALISE THE CURSOR AS A DATAFRAME RESTRICTED TO THOSE COLUMNS
df = pd.DataFrame(list(task_2)).loc[:, columns]

# WRITE THE CSV AND JSON OUTPUT FILES (THE KEY COLUMN IS STILL CALLED _id HERE)
df.to_csv(task_2_output+"task_2.csv", index=False)
df.to_json(
    task_2_output+"task_2.json",
    orient="records",
    date_format="epoch",
    force_ascii=True,
    default_handler=None,
)

# DISPLAY THE FIRST 20 ROWS WITH A READABLE COLUMN NAME
df = df.rename(columns={'_id': 'country'})
display(df.head(20))

Unnamed: 0,country,count
0,United States,686
1,United Kingdom,223
2,Japan,156
3,South Korea,116
4,Canada,107
5,France,70
6,Taiwan,65
7,India,55
8,Australia,50
9,Mexico,45


In [92]:
# TASK 3 PIPELINE: NUMBER OF TITLES PER GENRE, MOST FREQUENT FIRST
task_3_pipeline = [
    # Split the comma-separated listed_in string into a list of genres
    {'$addFields': {'genres': {'$split': ['$listed_in', ', ']}}},
    # One document per (title, genre)
    {'$unwind': {'path': '$genres'}},
    # Count documents per genre
    {'$group': {'_id': '$genres', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
]
task_3 = db.aggregate(task_3_pipeline)

# COLUMNS REQUIRED FOR THE TASK, IN OUTPUT ORDER
columns = ['_id', 'count']

# MATERIALISE THE CURSOR AS A DATAFRAME RESTRICTED TO THOSE COLUMNS
df = pd.DataFrame(list(task_3)).loc[:, columns]

# WRITE THE CSV AND JSON OUTPUT FILES (THE KEY COLUMN IS STILL CALLED _id HERE)
df.to_csv(task_3_output+"task_3.csv", index=False)
df.to_json(
    task_3_output+"task_3.json",
    orient="records",
    date_format="epoch",
    force_ascii=True,
    default_handler=None,
)

# DISPLAY THE FIRST 20 ROWS WITH A READABLE COLUMN NAME
df = df.rename(columns={'_id': 'genre'})
display(df.head(20))

Unnamed: 0,genre,count
0,International Movies,1927
1,Dramas,1623
2,Comedies,1113
3,International TV Shows,1001
4,Documentaries,668
5,TV Dramas,599
6,Action & Adventure,597
7,Independent Movies,552
8,TV Comedies,436
9,Thrillers,392


In [93]:
# TASK 4 PIPELINE: NUMBER OF TITLES PER ACTOR, MOST FREQUENT FIRST
task_4_pipeline = [
    # Split the comma-separated cast string into a list of actors
    {'$addFields': {'actors': {'$split': ['$cast', ', ']}}},
    # One document per (title, actor)
    {'$unwind': {'path': '$actors'}},
    # Count documents per actor
    {'$group': {'_id': '$actors', 'count': {'$sum': 1}}},
    {'$sort': {'count': -1}},
]
task_4 = db.aggregate(task_4_pipeline)

# COLUMNS REQUIRED FOR THE TASK, IN OUTPUT ORDER
columns = ['_id', 'count']

# MATERIALISE THE CURSOR AS A DATAFRAME RESTRICTED TO THOSE COLUMNS
df = pd.DataFrame(list(task_4)).loc[:, columns]

# WRITE THE CSV AND JSON OUTPUT FILES (THE KEY COLUMN IS STILL CALLED _id HERE)
df.to_csv(task_4_output+"task_4.csv", index=False)
df.to_json(
    task_4_output+"task_4.json",
    orient="records",
    date_format="epoch",
    force_ascii=True,
    default_handler=None,
)

# DISPLAY THE FIRST 20 ROWS WITH A READABLE COLUMN NAME
df = df.rename(columns={'_id': 'actor'})
display(df.head(20))

Unnamed: 0,actor,count
0,Anupam Kher,33
1,Shah Rukh Khan,30
2,Om Puri,27
3,Naseeruddin Shah,27
4,Yuki Kaji,26
5,Akshay Kumar,26
6,Takahiro Sakurai,25
7,Paresh Rawal,25
8,Amitabh Bachchan,24
9,Boman Irani,23


In [95]:
# TASK 5 PIPELINE: FOR EVERY ACTOR, THE GENRE THEY APPEAR IN MOST OFTEN
# AND THE NUMBER OF TITLES IN THAT GENRE, ORDERED BY ACTOR NAME
task_5 = db.aggregate([
    # Split the comma-separated cast string; one document per (title, actor)
    {'$addFields': {'actors': {'$split': ['$cast', ', ']}}},
    {'$unwind': {'path': '$actors'}},
    # Split the comma-separated listed_in string; one document per (title, actor, genre)
    {'$addFields': {'genre': {'$split': ['$listed_in', ', ']}}},
    {'$unwind': {'path': '$genre'}},
    # Number of titles for every (actor, genre) pair
    {
        '$group': {
            '_id': {'id': '$actors', 'genre': '$genre'},
            'count': {'$sum': 1}
        }
    },
    # Highest count first. The genre name is a secondary key so that, when an
    # actor has several genres with the same count, the same one is picked on
    # every run (without it the choice between tied genres is arbitrary).
    {'$sort': {'count': -1, '_id.genre': 1}},
    # Because of the sort above, the first pair seen for each actor is that
    # actor's most frequent genre; $first keeps exactly that pair. This
    # replaces collecting all pairs into an array, unwinding, re-sorting,
    # re-collecting and then reading element 0.
    {
        '$group': {
            '_id': '$_id.id',
            'type': {'$first': '$_id.genre'},
            'count': {'$first': '$count'}
        }
    },
    # Final ordering by actor name
    {'$sort': {'_id': 1}}
])

# APPEND QUERY RESULTS TO DATAFRAME
df = pd.DataFrame(list(task_5))

# SELECT THE COLUMNS REQUIRED FOR THE TASK
columns = ['_id', 'type', 'count']
df = df[columns]

# CREATE THE CSV AND JSON OUTPUT FILES (COLUMNS ARE STILL NAMED _id / type HERE)
df.to_csv(task_5_output+"task_5.csv", index=False)
df.to_json(task_5_output+"task_5.json", orient = "records", date_format = "epoch", force_ascii = True, default_handler = None)

# DISPLAY RESULTS
df = df.rename(columns = {'_id':'actor', 'type':'genre'})
display(df.head(20))

Unnamed: 0,actor,genre,count
0,2 Chainz,Docuseries,1
1,4Minute,Music & Musicals,1
2,50 Cent,Action & Adventure,2
3,A Boogie Wit tha Hoodie,Docuseries,1
4,A-ra Go,International TV Shows,1
5,A. Murat Özgen,International Movies,1
6,A.C. Peterson,Dramas,1
7,A.D. Miles,TV Comedies,2
8,A.J. Cook,Horror Movies,1
9,A.J. LoCascio,Kids' TV,2
