In [1]:
import json
import pandas as pd

class EPKG():
    """Load report metadata from a JSON export and derive one table per entity.

    Constructing an instance runs the whole pipeline: load the JSON, keep only
    rows whose file extension is accepted, snapshot the file-level columns,
    turn dict-valued cells into lists, and explode every entity column.

    Attributes set by the pipeline:
        data: the raw object returned by ``json.load``.
        df: the filtered metadata frame (one row per file).
        file_entities: file-level columns of ``df``.
        <prefix>_cols / <prefix>_df / unique_<prefix>: for each prefix in
            ``_ENTITY_SPECS`` (basin, block, well, author, contractor,
            type_of_format, types_of_report, status_of_report, fields,
            function, type_of_file): the selected column names, the exploded
            frame, and the distinct values of the entity column.
    """

    # Values seen in 'ReportAuthor' that are discarded instead of being kept as
    # authors. NOTE(review): they look like well identifiers - confirm.
    _NOT_AUTHORS = ('113-BD-1X', '107-PL-1X')

    # Columns whose cells may hold a dict that has to become a list of its values.
    _DICT_COLUMNS = (
        'Basin', 'Block', 'WellName', 'ReportAuthor', 'Contractors',
        'TypeofFormat', 'TypesOfReport', 'StatusOfReport', 'Fields',
        'Function', 'TypeOfFile',
    )

    # (attribute prefix, selected columns). The third column is always the
    # entity column that gets exploded.
    _ENTITY_SPECS = (
        ('basin', ['Name', 'GUID', 'Basin']),
        ('block', ['Name', 'GUID', 'Block', 'Basin']),
        ('well', ['Name', 'GUID', 'WellName', 'Block', 'Basin']),
        ('author', ['Name', 'GUID', 'ReportAuthor']),
        ('contractor', ['Name', 'GUID', 'Contractors']),
        ('type_of_format', ['Name', 'GUID', 'TypeofFormat']),
        ('types_of_report', ['Name', 'GUID', 'TypesOfReport']),
        ('status_of_report', ['Name', 'GUID', 'StatusOfReport']),
        ('fields', ['Name', 'GUID', 'Fields']),
        ('function', ['Name', 'GUID', 'Function']),
        ('type_of_file', ['Name', 'GUID', 'TypeOfFile']),
    )

    def __init__(self, file_path:str='data/raw/all_data.json'):
        """Run the full pipeline on the JSON file at ``file_path``.

        Args:
            file_path: path of the JSON export; defaults to the location the
                notebook has always used.
        """
        self.data = None
        self.df = None
        self.columns = [
               'GUID', 'Name', 'ReportName', 'Title',
               'YearOfPublication',
               'LinkingUri',
               'LinkingUrl',
               'ServerRelativeUrl',
               'ServerRedirectedEmbedUri',
               'ServerRedirectedEmbedUrl',

               'Created',
               'Length',

               'Basin', 'Block', 'WellName',
               'ReportAuthor',
               'Contractors',

               'TypeofFormat', 'TypesOfReport', 'StatusOfReport', 'Fields',

               'Function', 'TypeOfFile',
               ]
        # Matching is case-sensitive, hence the explicit upper/lower variants.
        self.accepted_ext = [
         'DOC', 'DOCX', 'PDF', 'XLS', 'doc', 'docx', 'pdf', 'plt', 'ppt', 'pptx', 'txt', 'xls', 'xlsx',
        ]
        self.load_data(file_path)
        self.filter_data()
        self.file_entity()
        self.convert_dict2list()
        self.create_single_entity_df()

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for None, pd.NA and float NaN; False for anything else."""
        return (
            value is None
            or value is pd.NA
            or (isinstance(value, float) and value != value)
        )

    def load_data(self, file_path:str='data/raw/all_data.json'):
        """Read the JSON file into ``self.data`` and build ``self.df``.

        The JSON object is turned into a frame, transposed so each top-level
        key becomes a row, and reduced to ``self.columns``.

        Raises:
            OSError: if the file cannot be opened.
            KeyError: if one of ``self.columns`` is absent from the data.
        """
        # Explicit encoding: the default is platform dependent and the data
        # contains non-ASCII names.
        with open(file_path, encoding='utf-8') as f:
            self.data = json.load(f)
        self.df = pd.DataFrame(self.data).T.reset_index()
        self.df = self.df[self.columns]

    def filter_data(self):
        """Keep only rows whose 'Name' ends in an accepted extension.

        The extension is the text after the last '.'; the comparison against
        ``self.accepted_ext`` is case-sensitive. The index is renumbered from 0.
        """
        extension = self.df['Name'].str.split('.').str[-1]
        self.df = self.df[extension.isin(self.accepted_ext)]
        self.df = self.df.reset_index(drop=True)

    # File Entity
    def file_entity(self):
        """Store the file-level columns of ``self.df`` in ``self.file_entities``."""
        selected_columns = ['Name', 'GUID', 'ReportName', 'Title', 'YearOfPublication',
                 'LinkingUri', 'LinkingUrl', 'ServerRelativeUrl', 'ServerRedirectedEmbedUri', 'ServerRedirectedEmbedUrl',
                 'Created', 'Length',
                 ]
        self.file_entities =  self.df[selected_columns]

    @classmethod
    def _clean_author(cls, value):
        """Normalise one 'ReportAuthor' cell.

        Strings listed in ``_NOT_AUTHORS`` become None, strings containing
        ' - ' are split into a list of names, everything else (including
        None/NaN) is returned unchanged.
        """
        if not isinstance(value, str):
            return value
        if value in cls._NOT_AUTHORS:
            return None
        if ' - ' in value:
            return value.split(' - ')
        return value

    def _split_authors(self, author_df):
        """Clean every 'ReportAuthor' cell and explode the resulting lists."""
        cleaned = [self._clean_author(value) for value in author_df['ReportAuthor']]
        # Build the column in one go instead of assigning cell by cell: the
        # exploded index may contain duplicates, and chained item assignment
        # does not reliably write through to the frame.
        author_df = author_df.assign(
            ReportAuthor=pd.Series(cleaned, index=author_df.index, dtype=object)
        )
        return author_df.explode('ReportAuthor')

    def create_single_entity_df(self):
        """Explode each entity column and publish the per-entity attributes.

        For every ``(prefix, cols)`` in ``_ENTITY_SPECS`` this sets
        ``<prefix>_cols``, ``<prefix>_df`` (``self.df[cols]`` exploded on the
        entity column) and ``unique_<prefix>`` (distinct entity values).
        Authors additionally go through ``_split_authors``.
        """
        for prefix, cols in self._ENTITY_SPECS:
            target = cols[2]
            entity_df = self.df[cols].explode(target)
            if prefix == 'author':
                entity_df = self._split_authors(entity_df)
            # Copy the column list so callers cannot mutate the class-level spec.
            setattr(self, f'{prefix}_cols', list(cols))
            setattr(self, f'{prefix}_df', entity_df)
            setattr(self, f'unique_{prefix}', entity_df[target].unique())

    def convert_dict2list(self):
        """Convert every dict-valued entity column of ``self.df`` to lists.

        A column is treated as dict-valued when its first non-missing cell is
        a dict, so a missing value in the first row (or an empty frame) no
        longer hides the column from conversion.
        """
        for col_name in self._DICT_COLUMNS:
            first_value = next(
                (value for value in self.df[col_name] if not self._is_missing(value)),
                None,
            )
            if isinstance(first_value, dict):
                self.__convert_dict2list(col_name)

    def __convert_dict2list(self, col_name:str):
        """Replace the dict cells of ``col_name`` with lists of their values.

        Missing cells become an empty list, except in 'Basin' where they are
        filled with the values of the first dict found in the column.
        NOTE(review): that fill presumably relies on every row sharing one
        basin - confirm against the data.
        Cells that are neither dicts nor missing are left untouched.
        """
        print(f"Converting {col_name} to list")
        column = self.df[col_name]

        fill_value = []
        if col_name == "Basin":
            first_dict = next((value for value in column if isinstance(value, dict)), None)
            if first_dict is not None:
                fill_value = list(first_dict.values())

        converted = []
        for value in column:
            if isinstance(value, dict):
                converted.append(list(value.values()))
            elif self._is_missing(value):
                # Fresh list per row so rows never alias one another.
                converted.append(list(fill_value))
            else:
                converted.append(value)

        self.df[col_name] = pd.Series(converted, index=self.df.index, dtype=object)

    def __save_csv(self, data:pd.DataFrame, file_path:str):
        """Write ``data`` to ``file_path`` as CSV without the index column."""
        data.to_csv(file_path, index=False)

In [2]:
# Instantiating EPKG runs its whole pipeline (load, filter, dict-to-list
# conversion, per-entity explode) on the default 'data/raw/all_data.json'.
EPProcessing = EPKG()
# Show the distinct 'ReportAuthor' values after cleaning and splitting on ' - '.
EPProcessing.unique_author

Converting Basin to list
Converting Block to list
Converting TypeofFormat to list
Converting TypesOfReport to list
Converting Fields to list


array(['CCOP', None, 'Nguyen Thanh Tung', 'Nguyen Tien Thinh',
       'Chu Duc Quang',
       'Nguyen The Hung, Luu Thanh Hung va nhung nguoi khac',
       'Nguyen Thi Dau', 'VPI Labs', 'Do Bat', 'Nguyen Manh Hung',
       'Nguyen Thi Tham', 'Bui Thi Ngoc Phuong',
       'Edi W. Jatmiko and Tidar A.B.Nurgrojo', 'Ha Quoc Quan',
       'Nguyen Huy Ngoc, Jamin Jamil  Bin Mohd Idris', 'Le Quang Chung',
       'Roszendy b.Danial', 'Mohd Zafuan Che Zulkifli',
       'International logging', 'Norkhairil B Mohamad',
       'Jamin Jamin Bin Mohd Idris', 'PVEP', 'International Logging',
       'VAST', 'Bui  Thi Ngoc Phuong', 'Hoang Manh Tan',
       'Nguyen Hong Minh and Hoang Manh Tan', 'Nguyen Van Giap',
       'Nguyen Trong Tri', 'Nguyen Duc Hung', 'Nguyen Dac The',
       'Nguyen Thi Bich Ha',
       'Vu Tien Lang, Cao Duc Thang, Nguyen Ngoc Minh, Nguyen Duc Hung, Dinh Van Huy',
       'EPC-VPI', 'PTSC Marine/OGS', 'Luu Khac Thieu', 'Pham Van Tuan',
       'Hoang Van Thach', 'J.L. PITTION',
