In [1]:
import os 
import pandas as pd 
import numpy as np 
from MLModels.preprocess import data_quality_check 


[INFO] movie_server_ip = 128.2.204.215


In [2]:
def read_mpg(mpg_path):
    with open(mpg_path, "r") as f:
        lines = f.readlines()
    f.close()
    
    # calculate unique users and movies
    movies = set()
    users = set()
    
    for line in lines:
        valid, tp = data_quality_check(line, "mpg")
        if valid:
            movies.add(tp[2])
            users.add(tp[1])
        else:
            continue
    
    return len(movies), len(users)

In [3]:
def read_ratings(ratings_path):
    with open(ratings_path, "r") as f:
        lines = f.readlines()
    f.close()
    
    total_rating = 0
    num_of_rating_records = 0
    
    for line in lines:
        valid, tp = data_quality_check(line, "rating")
        if valid:
            total_rating += tp[3]
            num_of_rating_records += 1
        else:
            continue
            
    return total_rating, num_of_rating_records


In [5]:
def calculate_metrics(paths):
    num_of_unique_movies_watched, num_of_active_users, total_rating, num_of_rating_records = 0, 0, 0, 0

    for log_path in paths:
        for filename in os.listdir(log_path):
            if filename[:3] == "mpg":
                _1, _2 = read_mpg(log_path + filename)
                num_of_unique_movies_watched += _1
                num_of_active_users += _2
            elif filename[:7] == "ratings":
                _1, _2 = read_ratings(log_path + filename)
                total_rating += _1
                num_of_rating_records += _2
    
    return num_of_unique_movies_watched, num_of_active_users, total_rating / num_of_rating_records


### take logs from: 2023-04-21 2100-2300

In [11]:
# get num_of_unique_movies_watched, num_of_active_users
num_of_unique_movies_watched, num_of_active_users, avg_rating = calculate_metrics(["data_main/2023-04-21/2100-2200/", "data_main/2023-04-21/2200-2300/"])

print("number of unique movies watched during 2023-04-21 2100-2300: ", num_of_unique_movies_watched)
print("number of active users during 2023-04-21 2100-2300: ", num_of_active_users)
print("average rating during 2023-04-21 2100-2300: ", avg_rating)

[INFO] got invalid movie: aa+man+escaped+1956
[INFO] got invalid movie: the+hobbit+the+battle+of+the+afive+armies+2014
[INFO] got invalid movie: the+lord+of+the+ri5ngs+the+fellowship+of+the+ring+2001
[INFO] got invalid movie: many+reivers+to+cross+1955
[INFO] got invalid movie: children+whno+chase+lost+voices+2011
[INFO] got invalid movie: jellyfwish+2007
[INFO] got invalid movie: inceptionq+2010
[INFO] got invalid movie: raiders+of+the+lost+arkr+1981
[INFO] got invalid movie: sthe+fortune+1975
[INFO] got invalid movie: babylon0+5+in+the+beginning+1998
[INFO] got invalid movie: a+shodrt+film+about+love+1988
[INFO] got invalid movie: t3he+lord+of+the+rings+the+return+of+the+king+2003
[INFO] got invalid movie: raider4s+of+the+lost+ark+1981
[INFO] got invalid movie: samurai9+rebellion+1967
[INFO] got invalid movie: schinhdlers+list+1993
[INFO] got invalid movie: one+flew+over+the+cuckoos+ne1st+1975
[INFO] got invalid movie: the+god7father+1972
[INFO] got invalid movie: how+tdo+train+your+

### take logs from: 2023-04-22 0000-0200

In [8]:
# get num_of_unique_movies_watched, num_of_active_users
num_of_unique_movies_watched, num_of_active_users, avg_rating = calculate_metrics(["data_main/2023-04-22/0000-0100/", "data_main/2023-04-22/0100-0200/"])

print("number of unique movies watched during 2023-04-22 0000-0200: ", num_of_unique_movies_watched)
print("number of active users during 2023-04-22 0000-0200: ", num_of_active_users)
print("average rating during 2023-04-22 0000-0200: ", avg_rating)

[INFO] got invalid movie: the+mshawshank+redemption+1994
[INFO] got invalid movie: schindlers+olist+1993
[INFO] got invalid movie: one+flew+over+the+cuckoosz+nest+1975
[INFO] got invalid movie: coming+down+the+mountlain+2007
[INFO] got invalid movie: princess+mono8noke+1997
[INFO] got invalid movie: the+ltord+of+the+rings+the+return+of+the+king+2003
[INFO] got invalid movie: make+way+for+to4morrow+1937
[INFO] got invalid movie: thce+lord+of+the+rings+the+fellowship+of+the+ring+2001
[INFO] got invalid movie: the+shawshank+redejmption+1994
[INFO] got invalid movie: genghis+blueps+1999
[INFO] got invalid movie: le+rtrou+1960
[INFO] got invalid movie: the+lord+of+thse+rings+the+fellowship+of+the+ring+2001
[INFO] got invalid movie: addagms+family+reunion+1998
[INFO] got invalid movie: toy+stovry+3+2010
[INFO] got invalid movie: harry+pottqer+and+the+deathly+hallows+part+1+2010
[INFO] got invalid movie: samurai+rebzellion+1967
[INFO] got invalid movie: the+thin+mhan+goes+home+1945
[INFO] got

### take logs from: 2023-04-21 0300-0500

In [15]:
# get num_of_unique_movies_watched, num_of_active_users
num_of_unique_movies_watched, num_of_active_users, avg_rating = calculate_metrics(["data_main/2023-04-21/0300-0400/", "data_main/2023-04-21/0400-0500/"])

print("number of unique movies watched during 2023-04-21 0300-0500: ", num_of_unique_movies_watched)
print("number of active users during 2023-04-21 0300-0500: ", num_of_active_users)
print("average rating during 2023-04-21 0300-0500: ", avg_rating)

number of unique movies watched during 2023-04-21 0300-0500:  25575
number of active users during 2023-04-21 0300-0500:  205399
average rating during 2023-04-21 0300-0500:  4.003090347209598


## before model update
### take logs from: 2023-04-24 1700-1900

In [6]:
# get num_of_unique_movies_watched, num_of_active_users
num_of_unique_movies_watched, num_of_active_users, avg_rating = calculate_metrics(["data_main/2023-04-24/1700-1800/", "data_main/2023-04-24/1800-1900/"])

print("number of unique movies watched during 2023-04-24 1700-1900: ", num_of_unique_movies_watched)
print("number of active users during 2023-04-24 1700-1900: ", num_of_active_users)
print("average rating during 2023-04-24 1700-1900: ", avg_rating)

[INFO] got invalid movie: milk+2008
[INFO] got invalid movie: the+boys+1962
[INFO] got invalid movie: 1+2013
[INFO] got invalid movie: beneath+2013
[INFO] got invalid movie: hamlet+2000
[INFO] got invalid movie: adrift+2009
[INFO] got invalid movie: the6+lord+of+the+rings+the+return+of+the+king+2003
[INFO] got invalid movie: nightmcrawler+2014
[INFO] got invalid movie: the+lord+of+the+rings+the+feillowship+of+the+ring+2001
[INFO] got invalid movie: chaos+2005
[INFO] got invalid movie: blades+runner+1982
[INFO] got invalid movie: black+sheep+2006
[INFO] got invalid movie: swiss+family+robinsorn+1960
[INFO] got invalid movie: emma+1996
[INFO] got invalid movie: rage+2009
[INFO] got invalid movie: the+go6dfather+1972
[INFO] got invalid movie: sevein+samurai+1954
[INFO] got invalid movie: aftermath+2012
[INFO] got invalid movie: the2+lives+of+others+2006
[INFO] got invalid movie: despicab8le+me+2+2013
[INFO] got invalid movie: the+oshawshank+redemption+1994
[INFO] got invalid movie: the+da

## 1st model update on 2023-04-24 19:57:50
### take logs from: 2023-04-24 2100-2300

In [9]:
# get num_of_unique_movies_watched, num_of_active_users
num_of_unique_movies_watched, num_of_active_users, avg_rating = calculate_metrics(["data_main/2023-04-24/2100-2200/", "data_main/2023-04-24/2200-2300/"])

print("number of unique movies watched during 2023-04-24 2100-2300: ", num_of_unique_movies_watched)
print("number of active users during 2023-04-24 2100-2300: ", num_of_active_users)
print("average rating during 2023-04-24 2100-2300: ", avg_rating)

number of unique movies watched during 2023-04-24 2100-2300:  31544
number of active users during 2023-04-24 2100-2300:  255783
average rating during 2023-04-24 2100-2300:  3.980235167399695


## 2nd model update on 2023-04-24 23:12:24
### take logs from: 2023-04-25 0000-0200

In [16]:
# get num_of_unique_movies_watched, num_of_active_users
num_of_unique_movies_watched, num_of_active_users, avg_rating = calculate_metrics(["data_main/2023-04-25/0000-0100/", "data_main/2023-04-25/0100-0200/"])

print("number of unique movies watched during 2023-04-25 0000-0200: ", num_of_unique_movies_watched)
print("number of active users during 2023-04-25 0000-0200: ", num_of_active_users)
print("average rating during 2023-04-25 0000-0200: ", avg_rating)

number of unique movies watched during 2023-04-25 0000-0200:  30581
number of active users during 2023-04-25 0000-0200:  249041
average rating during 2023-04-25 0000-0200:  4.035328422862344


## 3rd model update on 2023-04-25 02:29:06
### take logs from: 2023-04-25 0300-0500

In [17]:
# get num_of_unique_movies_watched, num_of_active_users
num_of_unique_movies_watched, num_of_active_users, avg_rating = calculate_metrics(["data_main/2023-04-25/0300-0400/", "data_main/2023-04-25/0400-0500/"])

print("number of unique movies watched during 2023-04-25 0300-0500: ", num_of_unique_movies_watched)
print("number of active users during 2023-04-25 0300-0500: ", num_of_active_users)
print("average rating during 2023-04-25 0300-0500: ", avg_rating)

[INFO] got invalid movie: wild+boys+of+the+rotad+1933
[INFO] got invalid movie: tohe+usual+suspects+1995
[INFO] got invalid movie: the+god1father+1972
[INFO] got invalid movie: le+5trou+1960
[INFO] got invalid movie: schindlers+3list+1993
[INFO] got invalid movie: the+lord+of+the+rings+the+fellowship+of+pthe+ring+2001
[INFO] got invalid movie: the+usuale+suspects+1995
[INFO] got invalid movie: anchors+awe2igh+1945
[INFO] got invalid movie: the+lord+of+the+r1ings+the+fellowship+of+the+ring+2001
[INFO] got invalid movie: incepvtion+2010
[INFO] got invalid movie: the+hobbit+the+battle+off+the+five+armies+2014
[INFO] got invalid movie: th2e+avengers+2012
[INFO] got invalid movie: kung+fu+hustole+2004
[INFO] got invalid movie: le+wtrou+1960
[INFO] got invalid movie: hustlet++flow+2005
[INFO] got invalid movie: t4he+usual+suspects+1995
[INFO] got invalid movie: the+lord+gof+the+rings+the+fellowship+of+the+ring+2001
[INFO] got invalid movie: the+lord+of+the+rinags+the+fellowship+of+the+ring+2