# Data clean up system 
Given the input folder 'raw_data', produce the folder 'sample_data' containing the CSV files obtained by cleaning
the raw data in the 'raw_data' folder.

In [1]:
import pandas as pd
import os
from glob import glob
from itertools import chain
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session. This keeps the notebook
# output short, but it also hides deprecation and parser warnings.
warnings.filterwarnings("ignore")

NameError: name 'assertElementsEqual' is not defined

## Exploratory data analysis
Inspect the content of the data and check for missing values.

## Helper functions to clean raw data

In [7]:
def get_cleaned_user_data(input_folder, user_data_file):
    """
    Load the user data and discard incomplete rows.

    Arguments:
        input_folder: name of the folder holding the raw files
        user_data_file: name of the JSON file, parsed with orient="split"
    Return:
        frame: a pandas data frame of users without missing values
    """
    source_path = f"{input_folder}/{user_data_file}"
    frame = pd.read_json(source_path, orient="split")
    has_missing = frame.isnull().values.any()
    if has_missing:
        # Drop every row that has at least one missing cell.
        frame.dropna(inplace=True)
    return frame


def get_cleaned_ranking_data(input_folder, ranking_data_file):
    """
    Returns cleaned ranking data.

    The file is read as '::'-separated text without a header row; rows that
    contain a missing value are dropped.

    Arguments:
        input_folder: the input folder name
        ranking_data_file: the input file name for ranking data
    Return:
        ranking: a pandas data frame with the columns 'BrukerID', 'FilmID',
            'Rangering' and 'Tidstempel'
    """
    ranking = pd.read_csv(
        f"{input_folder}/{ranking_data_file}",
        sep="::",
        names=["BrukerID", "FilmID", "Rangering", "Tidstempel"],
        # A multi-character separator is only supported by the python parser;
        # selecting it explicitly avoids pandas' ParserWarning fallback.
        engine="python",
    )
    # dropna is a no-op on a complete frame, so no prior null check is needed.
    ranking.dropna(inplace=True)
    return ranking


def get_data_files_from(input_folder):
    """
    List the raw data files found directly inside a folder.

    Arguments:
        input_folder: the folder to scan
    Return:
        selected: a list with the base names of the entries whose extension
            is '.json', '.xlsx' or '.dat'
    """
    wanted_extensions = ['.json', '.xlsx', '.dat']
    selected = []
    for path in glob(f"{input_folder}/*"):
        name = os.path.basename(path)
        _, extension = os.path.splitext(name)
        if extension in wanted_extensions:
            selected.append(name)
    return selected


def get_cleaned_film_data(input_folder, film_data_file):
    """
    Returns cleaned film data with one indicator column per genre.

    Reads the 'film' sheet, sorts the films by 'FilmID' and expands the
    '|'-separated 'Sjanger' column into one 0/1 column per genre, ordered by
    genre name. The 'Ukjennt' and "Children's" genre columns are left out of
    the result.

    Arguments:
        input_folder: the input folder name
        film_data_file: the input file name for film data
    Return:
        df_film: a pandas data frame with the columns 'FilmID', 'Tittel' and
            one 0/1 column per remaining genre; rows with missing values
            are dropped
    """
    df = pd.read_excel(f"{input_folder}/{film_data_file}", sheet_name='film', index_col=0)
    df = df.sort_values(by=['FilmID'], ignore_index=True)
    df_id_title = df[['FilmID', 'Tittel']]

    # One list of genre names per film, in the same (sorted) row order as df.
    genres_per_film = [s.split('|') for s in df['Sjanger']]
    unique_genres = sorted(set(chain.from_iterable(genres_per_film)))

    # Build the indicators row by row rather than through a FilmID-keyed dict:
    # a dict collapses duplicated FilmIDs and then no longer lines up with
    # the rows of df.
    indicators = {
        genre: [1 if genre in genres else 0 for genres in genres_per_film]
        for genre in unique_genres
    }
    df_genre = pd.DataFrame(indicators, index=df.index)
    # errors="ignore": these genres need not occur in every data set.
    df_genre = df_genre.drop(columns=["Ukjennt", "Children's"], errors="ignore")

    df_film = pd.concat([df_id_title, df_genre], axis=1)
    # dropna is a no-op on a complete frame, so no prior null check is needed.
    df_film.dropna(inplace=True)
    return df_film


def save_csv_files(input_folder, output_folder):
    """
    Given an input folder name and an output folder name,
    save all cleaned data in a csv format, in the output
    folder.

    Each raw file is cleaned by its dedicated function and written as
    '<base name>.csv'. The output folder is created when it does not exist.

    Arguments:
        input_folder: the input folder; must be 'raw_data'
        output_folder: the output folder
    Return
        None
    Raise
        AssertionError: if input_folder is not 'raw_data', or if the data
            files found in it are not exactly 'film.xlsx', 'bruker.json'
            and 'rangering.dat'
    """
    # Raised explicitly (instead of via `assert`) so that the checks still
    # run when Python is started with -O; the exception type is unchanged.
    if input_folder != "raw_data":
        raise AssertionError("Input folder name must be 'raw_data'")

    # Raw file name -> function that loads and cleans it.
    cleaners = {
        "film.xlsx": get_cleaned_film_data,
        "bruker.json": get_cleaned_user_data,
        "rangering.dat": get_cleaned_ranking_data,
    }
    data_files = get_data_files_from(input_folder)
    if sorted(data_files) != sorted(cleaners):
        raise AssertionError(
            f"{input_folder} must contain: 'film.xlsx', 'bruker.json', 'rangering.dat'"
        )

    # Created once up front; exist_ok avoids the separate existence check.
    os.makedirs(output_folder, exist_ok=True)
    for file_name, clean in cleaners.items():
        df = clean(input_folder, file_name)
        new_name = file_name.split('.')[0]
        print(f"saving {new_name}.csv")
        df.to_csv(f"{output_folder}/{new_name}.csv")

In [8]:
# Clean everything in 'raw_data' and write the csv files to 'sample_data1'.
# NOTE(review): the notebook introduction names the output folder
# 'sample_data' — confirm which folder name is intended.
save_csv_files("raw_data", "sample_data1")

['bruker.json', 'film.xlsx', 'rangering.dat']
--------------
['film.xlsx', 'bruker.json', 'rangering.dat']
saving film.csv
saving bruker.csv
saving rangering.csv


In [None]:
def plot_missing_values(df, name):
    """
    Show a bar chart with the percentage of missing values per column.

    Arguments:
        df: the pandas data frame to inspect
        name: label of the data set, used in the plot title
    Return
        None
    """
    # The mean of the boolean mask is the fraction of missing cells per
    # column; scale by 100 so the bars match the '%' axis label and the title.
    missing_percent = df.isnull().mean() * 100
    plt.bar(missing_percent.index, missing_percent.values)
    plt.ylabel('%')
    plt.title(f'Percentage of missing value for {name}')
    plt.show()