## MovieDataLoader

In [206]:
import pandas as pd
import os

class MovieDataLoader:
    """Load dataset files into DataFrames and apply small cleaning steps.

    ``load`` reads one file from disk. The remaining helpers are static,
    take a DataFrame and return a NEW DataFrame; the frame passed in is never
    modified, so re-running a notebook cell always starts from the same data.
    """

    def __init__(self):
        """Initialize without parameters."""
        pass

    def load(self, file_path, encoding="utf-8"):
        """
        Load a single file into a DataFrame.

        ``.csv`` files are read with pandas' defaults (first line is the
        header). ``.txt`` files are split on ``|`` or tab and read with
        ``header=None``, so their first row still holds the column names
        (see ``remove_first_row``). The extension check is case-insensitive.

        Parameters:
        - file_path: str or os.PathLike, the path to the file to load.
        - encoding: str, encoding to use for reading the file.

        Returns:
        - DataFrame: the loaded DataFrame, or None when loading failed.
          Failures (missing file, unsupported extension, parse errors) are
          reported with ``print`` rather than raised, so callers should check
          for None.
        """
        try:
            # Normalise str / PathLike to a plain string for the checks below.
            path = os.fspath(file_path)

            # Check if the file exists
            if not os.path.exists(path):
                raise FileNotFoundError(f"File not found: {file_path}")

            # Load data based on file extension (case-insensitive)
            lowered = path.lower()
            if lowered.endswith(".csv"):
                return pd.read_csv(path, engine='c', encoding=encoding)
            if lowered.endswith(".txt"):
                # A regex separator requires the python engine.
                return pd.read_csv(path, delimiter='[|\t]', engine='python', header=None, encoding=encoding)
            raise ValueError("Unsupported file format. Only .csv and .txt are supported.")
        except FileNotFoundError as e:
            print(f"Error: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
        return None

    @staticmethod
    def remove_first_row(df):
        """
        Promote the first row to column names and drop it from the data.

        Parameters:
        - df: DataFrame whose first row contains the header values.

        Returns:
        - DataFrame: a new frame with the first row used as column labels,
          that row removed, and the index renumbered from 0.

        Raises:
        - IndexError: if ``df`` has no rows.
        """
        header = df.iloc[0].tolist()  # plain list: no leftover name on the column index
        out = df.iloc[1:].copy()      # copy so the caller's frame stays untouched
        out.columns = header
        return out.reset_index(drop=True)

    @staticmethod
    def change_datatype(df, column, dtype):
        """
        Change the data type of a specific column.

        Call this last in a cleaning pipeline: converting a column that still
        contains NaN to an integer dtype raises.

        Returns:
        - DataFrame: a new frame with ``column`` converted to ``dtype``.
        """
        out = df.copy()
        out[column] = out[column].astype(dtype)
        return out

    @staticmethod
    def fill_each_other(df, col1, col2):
        """
        Fill missing values in each of two columns from the other column.

        Rows where both columns are missing cannot be repaired and are
        dropped. The index is not renumbered, so dropped rows leave gaps.

        Returns:
        - DataFrame: a new frame with the two columns filled.
        """
        out = df.copy()
        out[col1] = out[col1].fillna(out[col2])
        out[col2] = out[col2].fillna(out[col1])
        # After the mutual fill, a remaining null means both were null.
        return out.dropna(subset=[col1, col2])

    @staticmethod
    def fill_val(df, column, value):
        """
        Fill missing values in a specific column with a given value.

        Returns:
        - DataFrame: a new frame with the missing entries of ``column``
          replaced by ``value``.
        """
        out = df.copy()
        out[column] = out[column].fillna(value)
        return out

## Load Dataset

In [207]:
# Root folder of the dataset, relative to this notebook.
kmrd_path = "../../kmrd-small"

# Map every dataset file name to its location under `kmrd_path`.
file_paths = {
    file_name: f"{kmrd_path}/{file_name}"
    for file_name in (
        'countries.csv',
        'movies.txt',
        'genres.csv',
        'rates.csv',
        'peoples.txt',
        'castings.csv',
    )
}

loader = MovieDataLoader()

# 'countries.csv' is used exactly as loaded.
countries_df = loader.load(file_paths['countries.csv'])

# 'movies.txt': promote the header row, repair titles from each other,
# fill the remaining gaps, and cast the numeric columns last.
movies_df = (
    loader.load(file_paths['movies.txt'])
    .pipe(MovieDataLoader.remove_first_row)
    .pipe(MovieDataLoader.fill_each_other, 'title', 'title_eng')
    .pipe(MovieDataLoader.fill_val, 'grade', 'Unknown')
    .pipe(MovieDataLoader.fill_val, 'year', 0)
    .pipe(MovieDataLoader.change_datatype, 'year', int)
    .pipe(MovieDataLoader.change_datatype, 'movie', int)
)

# 'genres.csv' is used exactly as loaded.
genres_df = loader.load(file_paths['genres.csv'])

# 'rates.csv': the 'time' column holds Unix timestamps in seconds; convert to datetime.
rates_df = loader.load(file_paths['rates.csv']).assign(
    time=lambda frame: pd.to_datetime(frame['time'], unit='s')
)

# 'peoples.txt': promote the header row, fill missing original names, cast the id last.
peoples_df = (
    loader.load(file_paths['peoples.txt'])
    .pipe(MovieDataLoader.remove_first_row)
    .pipe(MovieDataLoader.fill_val, 'original', 'Unknown')
    .pipe(MovieDataLoader.change_datatype, 'people', int)
)

# 'castings.csv' is used exactly as loaded.
castings_df = loader.load(file_paths['castings.csv'])

In [208]:
import platform

import matplotlib.pyplot as plt

# Korean font setup: pick a Hangul-capable font for the current operating system
# instead of hard-coding the macOS-only 'AppleGothic'.
# NOTE(review): 'NanumGothic' is the fallback for other systems (e.g. Linux) and is
# assumed to be installed there — confirm on the target machine.
plt.rcParams['font.family'] = {
    'Darwin': 'AppleGothic',
    'Windows': 'Malgun Gothic',
}.get(platform.system(), 'NanumGothic')
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a broken glyph

In [209]:

# Sanity-check the frames used by the recommender: first rows plus schema.
# To inspect another frame, add it to the tuple below, e.g.
# ("Countries", countries_df), ("Genres", genres_df),
# ("Peoples", peoples_df), ("Castings", castings_df).
for label, frame in (("Movies", movies_df), ("Rates", rates_df)):
    print(f"\n{label} DataFrame:")
    print(frame.head())
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would add a stray "None" line to the output.
    frame.info()



Movies DataFrame:
0  movie                 title                           title_eng  year  \
0  10001                시네마 천국              Cinema Paradiso , 1988  2013   
1  10002              빽 투 더 퓨쳐           Back To The Future , 1985  2015   
2  10003            빽 투 더 퓨쳐 2    Back To The Future Part 2 , 1989  2015   
3  10004            빽 투 더 퓨쳐 3  Back To The Future Part III , 1990  1990   
4  10005  스타워즈 에피소드 4 - 새로운 희망                    Star Wars , 1977  1997   

0    grade  
0   전체 관람가  
1  12세 관람가  
2  12세 관람가  
3   전체 관람가  
4       PG  
<class 'pandas.core.frame.DataFrame'>
Index: 992 entries, 0 to 998
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      992 non-null    int64 
 1   title      992 non-null    object
 2   title_eng  992 non-null    object
 3   year       992 non-null    int64 
 4   grade      992 non-null    object
dtypes: int64(2), object(3)
memory usage: 46.5+ KB
None

Rates DataFrame:


## RandomRecommender

In [210]:
import pandas as pd
import numpy as np

class RandomRecommender:
    """Baseline recommender that ranks movies by randomly perturbed ratings."""

    def __init__(self, movies_df, rates_df):
        """
        Initialize with movies and ratings DataFrames.

        Parameters:
        - movies_df (DataFrame): must contain the columns "movie" and "title".
        - rates_df (DataFrame): must contain the columns "movie" and "rate".
        """
        self.movies_df = movies_df
        self.rates_df = rates_df

    def run(self, n=3, variation=50, random_state=None):
        """
        Generate random ratings and return the resulting DataFrame.

        Every individual rating is multiplied by a factor drawn uniformly
        from 1 ± (variation / 100); the perturbed ratings are then averaged
        per movie and the ``n`` movies with the highest average are returned.

        Parameters:
        - n (int): Number of top results to return.
        - variation (int): Variation percentage for random error, applied as 1 ± (variation / 100).
        - random_state (int | None): Seed for a private NumPy generator, making
          the result reproducible. When None (the default) the draw comes from
          NumPy's global random state, exactly as before this parameter existed.

        Returns:
        - DataFrame with the columns "movie", "rate_random" and "title",
          holding at most ``n`` rows ordered by descending "rate_random".
        """
        # Convert variation to percentage-based float
        spread = variation / 100.0

        # Choose the random source; the global state stays the default for compatibility.
        if random_state is None:
            draw_uniform = np.random.uniform
        else:
            draw_uniform = np.random.default_rng(random_state).uniform

        top_n_df = (self.movies_df.merge(self.rates_df, on="movie")
                    .assign(rate_random=lambda df: df["rate"] * draw_uniform(1 - spread, 1 + spread, size=len(df)))
                    .groupby("movie", as_index=False)
                    .agg({"rate_random": "mean", "title": "first"})
                    .nlargest(n, "rate_random"))

        return top_n_df

1. DataFrame.assign(new_column_name=calculation_or_value)
: 새로운 열을 추가하거나, 기존 열을 수정하는 함수

2. lambda arguments: expression   
arguments: 함수의 인자.  
expression: 반환할 계산식(항상 값을 반환함).  
   
- lambda df: df["rate"] * np.random.uniform(0.5, 1.5, size=len(df))
df를 인자로 받아 df["rate"] * np.random.uniform(0.5, 1.5, size=len(df))를 반환한다. 

3. random.uniform(a, b) : [a, b] 범위 내에서 임의로 선택된 실수 값을 반환.  
- np.random.uniform(low, high, size) : np.random.uniform은 배열 형태로 다수의 난수를 생성
low: 범위의 최소값 (포함).  
high: 범위의 최대값 (미포함, 반개구간 [low, high)).  
size: 생성할 난수의 개수 (배열 크기).  
                                              
- np.random.uniform(0.5, 1.5, size=len(df)):   
NumPy의 random.uniform 함수는 0.5에서 1.5 사이의 난수를 생성한다 -> +50%, -50%의 변동을 줌.   
값의 비율을 적당히 조정하면서, 원래 값의 의미를 크게 벗어나지 않도록 한다.

size=len(df)를 통해 데이터프레임의 행 수와 동일한 길이의 난수 배열을 생성한다.   

## Analyzer

In [213]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

class Analyzer:
    """Score a recommender result against the observed ratings.

    Every row of ``rates_df`` whose movie appears in ``result_df`` is paired
    with that movie's ``rate_random``; the error metrics are computed over
    those pairs.
    """

    def __init__(self, result_df, rates_df):
        """Keep the result frame ("movie", "rate_random") and the ratings frame ("movie", "rate")."""
        self.rates_df = rates_df
        self.result_df = result_df

    def _paired_ratings(self):
        """Inner-join the ratings with the result on "movie"."""
        return self.rates_df.merge(self.result_df, on="movie")

    def calculate_metrics(self):
        """Return MAE, MSE, RMSE and MAPE (in percent) computed with NumPy, as a dict."""
        pairs = self._paired_ratings()[["rate", "rate_random"]]
        observed = pairs["rate"]
        residual = observed - pairs["rate_random"]
        abs_residual = np.abs(residual)
        mean_squared = np.mean(residual ** 2)

        return {
            "MAE": np.mean(abs_residual),
            "MSE": mean_squared,
            "RMSE": np.sqrt(mean_squared),
            "MAPE": np.mean(abs_residual / np.abs(observed)) * 100,
        }

    def calculate_sklearn_metrics(self):
        """Return the same four metrics computed with sklearn, as a cross-check."""
        pairs = self._paired_ratings()
        y_true, y_pred = pairs["rate"], pairs["rate_random"]

        mae = mean_absolute_error(y_true, y_pred)
        mse = mean_squared_error(y_true, y_pred)

        return {
            "MAE_sklearn": mae,
            "MSE_sklearn": mse,
            "RMSE_sklearn": np.sqrt(mse),  # RMSE derived directly from the MSE
            "MAPE_sklearn": mean_absolute_percentage_error(y_true, y_pred) * 100,
        }

    def display_results(self):
        """Print the NumPy metrics followed by the sklearn metrics, four decimals each."""
        # Both metric sets are computed up front, before anything is printed.
        sections = (
            ("Analysis Metrics:", self.calculate_metrics()),
            ("\nSklearn Metrics:", self.calculate_sklearn_metrics()),
        )
        for heading, scores in sections:
            print(heading)
            for metric, value in scores.items():
                print(f"{metric}: {value:.4f}")

In [214]:
# Experiment: top-1 movie with the default ±50% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=1)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 4.6258
MSE: 21.3981
RMSE: 4.6258
MAPE: 46.2581

Sklearn Metrics:
MAE_sklearn: 4.6258
MSE_sklearn: 21.3981
RMSE_sklearn: 4.6258
MAPE_sklearn: 46.2581


In [215]:
# Experiment: top-10 movies with the default ±50% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=10)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 1.4208
MSE: 2.7234
RMSE: 1.6503
MAPE: 14.9640

Sklearn Metrics:
MAE_sklearn: 1.4208
MSE_sklearn: 2.7234
RMSE_sklearn: 1.6503
MAPE_sklearn: 14.9640


In [216]:
# Experiment: top-10 movies with variation=0, i.e. every noise factor is exactly 1,
# so "rate_random" is the plain per-movie mean rating (noise-free baseline).
recommender = RandomRecommender(movies_df=movies_df, rates_df=rates_df)
result_df = recommender.run(n=10, variation=0)
analyzer = Analyzer(result_df=result_df, rates_df=rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 0.1288
MSE: 0.0644
RMSE: 0.2538
MAPE: 1.3600

Sklearn Metrics:
MAE_sklearn: 0.1288
MSE_sklearn: 0.0644
RMSE_sklearn: 0.2538
MAPE_sklearn: 1.3600


In [217]:
# Experiment: top-10 movies with ±80% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=10, variation=80)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 3.0676
MSE: 12.1362
RMSE: 3.4837
MAPE: 36.1307

Sklearn Metrics:
MAE_sklearn: 3.0676
MSE_sklearn: 12.1362
RMSE_sklearn: 3.4837
MAPE_sklearn: 36.1307


In [218]:
# Experiment: top-30 movies with the default ±50% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=30)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 0.8613
MSE: 2.8199
RMSE: 1.6792
MAPE: 30.3617

Sklearn Metrics:
MAE_sklearn: 0.8613
MSE_sklearn: 2.8199
RMSE_sklearn: 1.6792
MAPE_sklearn: 30.3617


In [219]:
# Experiment: top-50 movies with the default ±50% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=50)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 1.0101
MSE: 3.5320
RMSE: 1.8793
MAPE: 39.7740

Sklearn Metrics:
MAE_sklearn: 1.0101
MSE_sklearn: 3.5320
RMSE_sklearn: 1.8793
MAPE_sklearn: 39.7740


In [220]:
# Experiment: top-100 movies with the default ±50% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=100)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 1.0606
MSE: 3.5590
RMSE: 1.8865
MAPE: 39.3681

Sklearn Metrics:
MAE_sklearn: 1.0606
MSE_sklearn: 3.5590
RMSE_sklearn: 1.8865
MAPE_sklearn: 39.3681


In [221]:
# Experiment: top-500 movies with the default ±50% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=500)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 1.2737
MSE: 3.9548
RMSE: 1.9887
MAPE: 41.0228

Sklearn Metrics:
MAE_sklearn: 1.2737
MSE_sklearn: 3.9548
RMSE_sklearn: 1.9887
MAPE_sklearn: 41.0228


In [222]:
# Experiment: top-1000 movies with the default ±50% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=1000)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 1.3065
MSE: 4.1138
RMSE: 2.0282
MAPE: 42.7856

Sklearn Metrics:
MAE_sklearn: 1.3065
MSE_sklearn: 4.1138
RMSE_sklearn: 2.0282
MAPE_sklearn: 42.7856


In [223]:
# Experiment: top-1000 movies with ±80% noise.
# RandomRecommender.run draws from NumPy's global RNG; seed it so that
# re-running this cell reproduces the same metrics.
np.random.seed(42)
recommender = RandomRecommender(movies_df, rates_df)
result_df = recommender.run(n=1000, variation=80)
analyzer = Analyzer(result_df, rates_df)
analyzer.display_results()

Analysis Metrics:
MAE: 1.3156
MSE: 4.1544
RMSE: 2.0382
MAPE: 42.9238

Sklearn Metrics:
MAE_sklearn: 1.3156
MSE_sklearn: 4.1544
RMSE_sklearn: 2.0382
MAPE_sklearn: 42.9238
