# Imports

In [1]:
import json
import pandas as pd

# Constants

In [2]:
# Directory under /kaggle/input that holds the three input files below.
_DATASET_DIR = '/kaggle/input/rta-dubai'

# Excel workbooks (clean and unclean samples) and the JSON metadata file.
CLEANED_DATA_PATH = _DATASET_DIR + '/sample_clean_application_data_2.xlsx'
UNCLEANED_DATA_PATH = _DATASET_DIR + '/sample_unclean_application_data_v2.xlsx'
METADATA_PATH = _DATASET_DIR + '/Application dataset.json'

# Models

In [3]:
class MetaData:
    """Attribute-style wrapper around a dictionary.

    Every key of the dictionary passed to the constructor becomes an
    instance attribute, e.g. ``MetaData({'a': 1}).a == 1``.
    """

    def __init__(self, data):
        """Expose the entries of ``data`` as attributes of this instance.

        Args:
            data: dict whose keys become attribute names and whose values
                become the attribute values.

        Raises:
            TypeError: if ``data`` is not a dict.
        """
        if not isinstance(data, dict):
            raise TypeError(
                f"MetaData expects a dict, not {type(data).__name__!r}"
            )
        # Shallow-copy instead of aliasing: assigning the caller's dict directly
        # would make attribute writes on this object mutate the caller's data
        # (and vice versa).
        self.__dict__ = dict(data)

# Helper Function

In [4]:
def load_json_and_map_to_class(json_path):
    """Read a JSON file and wrap its parsed content in a ``MetaData`` object.

    Args:
        json_path: path of the JSON file to read.

    Returns:
        A ``MetaData`` instance built from the parsed JSON, or ``None`` when
        the file content is not valid JSON (an error message is printed).

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist); this
            is not caught here.
    """
    try:
        # JSON text is UTF-8 by specification; do not depend on the platform's
        # default encoding.
        with open(json_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
    except json.JSONDecodeError:
        # The message must reference this function's own parameter; using an
        # undefined name here would raise NameError instead of reporting.
        print(f"Error: Unable to decode JSON from file '{json_path}'. File may be empty or not in valid JSON format.")
        return None
    return MetaData(data)

# OpenAI Utils

In [5]:
def map_column_names_wrt_meta_data(dataset_column_names):
    """Return the old-name -> new-name mapping for the unclean dataset columns.

    This is a placeholder for a future OpenAI call. At present the argument is
    not used and the same hand-written mapping is returned on every call.

    Args:
        dataset_column_names: the unclean dataset's column names (currently
            ignored).

    Returns:
        dict mapping each old column name to the attribute name chosen for it.

    Planned prompt for the OpenAI call (it also needs ``columns_meta_data_json``,
    the column attribute objects extracted from the metadata, which is not yet
    a parameter of this function):

        Task: Consider the 'dataset_column_names' defined below. Look up
        'columns_meta_data_json' and pick the relevant column attribute_name
        with respect to the attribute description and synonyms context. The
        output should be a JSON object whose keys are the
        'dataset_column_names' and whose values are the attribute_name picked
        from columns_meta_data_json.

        dataset_column_names: <COLUMN NAMES WOULD GO HERE...>
        columns_meta_data_json:
        [<ATTRIBUTE OBJECT(Name, Desc, Syn)>, <ATTRIBUTE OBJECT(Name, Desc, Syn)>, ...]

        Output example: {"Old Column Name": "New Column Name"}
    """
    # NOTE(review): "I/X" and "Mobile compatible" both map to
    # "Mobile Compliance"; renaming with this mapping gives two columns the
    # same name. Confirm the intended target for "I/X".
    old_to_new_pairs = (
        ("Name", "Name"),
        ("Short Name", "Short Name"),
        ("Description", "Description"),
        ("Business Owner", "Business Owner"),
        ("Technical Owner", "Technical Owner"),
        ("# users", "Number of Users"),
        ("RPO", "Recovery Point Objective"),
        ("RTO", "Recovery Time Objective"),
        ("start date", "Go Live Date"),
        ("end date", "End of Support Date"),
        ("SLA", "SLA Type"),
        ("Vendor", "Vendor"),
        ("Status", "Status"),
        ("AGF score", "AGF Classification"),
        ("DR TYpe", "DR Type"),
        ("Required Availability", "Required Availability"),
        ("location", "System Hosting Place"),
        ("I/X", "Mobile Compliance"),
        ("Type of System (Gartner)", "Type of System"),
        ("Across RTA?", "User Community"),
        ("commercial or customized", "Development Type"),
        ("Recommendation", "Recommendation"),
        ("Cloud Migration plan", "Cloud Migration Strategy"),
        ("Type", "Architecture Type"),
        ("Mobile compatible", "Mobile Compliance"),
        ("support many langs", "Multi Language Support"),
    )
    # A new dict is built on each call, so callers may modify the result freely.
    return dict(old_to_new_pairs)

# Pipeline

1. Extract the **unclean dataset column names** and the **attribute objects** from the metadata.
2. Use OpenAI to map each column to its **new column name** according to the provided metadata.
3. Rename the columns.

# Code

In [6]:
# Read the workbook at CLEANED_DATA_PATH into a DataFrame, then print its
# column summary (column names, non-null counts, dtypes).
df_cleaned = pd.read_excel(CLEANED_DATA_PATH)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 27 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Name                      100 non-null    object        
 1   Short Name                100 non-null    object        
 2   Description               100 non-null    object        
 3   Business Owner            100 non-null    object        
 4   Technical Owner           100 non-null    object        
 5   Number of Users           56 non-null     object        
 6   Recovery Point Objective  26 non-null     object        
 7   Recovery Time Objective   27 non-null     object        
 8   Go Live Date              57 non-null     datetime64[ns]
 9   End of Support Date       58 non-null     datetime64[ns]
 10  SLA Type                  33 non-null     object        
 11  Vendor                    100 non-null    object        
 12  Status                 

In [7]:
# Read the workbook at UNCLEANED_DATA_PATH into a DataFrame, then print its
# column summary (column names, non-null counts, dtypes).
df_uncleaned = pd.read_excel(UNCLEANED_DATA_PATH)
df_uncleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Name                      100 non-null    object        
 1   Short Name                100 non-null    object        
 2   Description               100 non-null    object        
 3   Business Owner            100 non-null    object        
 4   Technical Owner           100 non-null    object        
 5   # users                   56 non-null     object        
 6   RPO                       26 non-null     object        
 7   RTO                       27 non-null     object        
 8   start date                57 non-null     datetime64[ns]
 9   end date                  58 non-null     datetime64[ns]
 10  SLA                       33 non-null     object        
 11  Vendor                    100 non-null    object        
 12  Status                 

### 1. Convert the uncleaned data column names according to the metadata context
1. Extract the **unclean dataset column names** and the **attribute objects** from the metadata.
2. Use OpenAI to map each column to its **new column name** according to the provided metadata.
3. Rename the columns.

In [8]:
# Extract old column names
uncleaned_column_old_names = ','.join(df_uncleaned.columns)

# Use Open-Ai to map new column names w.r.t to context
uncleaned_column_new_names = map_column_names_wrt_meta_data(uncleaned_column_old_names)

# Rename columns
df_uncleaned.rename(columns=uncleaned_column_new_names, inplace=True)

In [9]:
# Print the column summary again to check the column names after the rename.
df_uncleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Name                      100 non-null    object        
 1   Short Name                100 non-null    object        
 2   Description               100 non-null    object        
 3   Business Owner            100 non-null    object        
 4   Technical Owner           100 non-null    object        
 5   Number of Users           56 non-null     object        
 6   Recovery Point Objective  26 non-null     object        
 7   Recovery Time Objective   27 non-null     object        
 8   Go Live Date              57 non-null     datetime64[ns]
 9   End of Support Date       58 non-null     datetime64[ns]
 10  SLA Type                  33 non-null     object        
 11  Vendor                    100 non-null    object        
 12  Status                 