In [1]:
def main(csvfile):
    """Run the country and category analyses on one CSV file.

    Args:
        csvfile: Path of the CSV file to analyse.

    Returns:
        A ``(country_info, category_info)`` tuple holding the results of
        ``calculate_country_info`` and ``calculate_category_info``. Both are
        empty dicts when the reader returns no header row (for example when
        it reports the file as not found).
    """
    data, headers = read_csv_data(csvfile)
    # Without a header row the column lookups below would fail with an
    # unrelated "'country' is not in list" ValueError, so stop early.
    if not headers:
        return {}, {}
    country_info = calculate_country_info(data, headers)
    category_info = calculate_category_info(data, headers)
    return country_info, category_info

def read_csv_data(csvfile):
    """Read a comma-separated file into data rows and lower-cased headers.

    Fields are split on every comma; quoted fields are not interpreted.

    Args:
        csvfile: Path of the file to read (opened as UTF-8 text).

    Returns:
        A ``(data, headers)`` tuple. ``headers`` is the first line split on
        commas and lower-cased; ``data`` is a list of rows, each a list of
        strings. Rows whose field count differs from the header are dropped.
        ``([], [])`` is returned when the file does not exist (a message is
        printed) or when the file is empty.
    """
    try:
        with open(csvfile, 'r', encoding='utf-8') as file:
            lines = file.readlines()
    except FileNotFoundError:
        print("File not found: " + csvfile)
        return [], []

    # An empty file has no header line; report "no data" the same way as a
    # missing file instead of failing on lines[0].
    if not lines:
        return [], []

    headers = [header.lower() for header in lines[0].strip().split(',')]
    data = []

    for line in lines[1:]:
        values = line.strip().split(',')
        # Keep only rows that line up with the header columns.
        if len(values) == len(headers):
            data.append(values)

    return data, headers

def calculate_country_info(data, headers):
    """Build a per-country summary of profit change and staff/salary gap.

    Args:
        data: Rows of string values, one list per organisation.
        headers: Lower-cased column names matching the positions in each row.

    Returns:
        A dict mapping each country to ``[t_score, distance]``, both rounded
        to 4 decimal places. ``t_score`` comes from ``calculate_t_test`` on
        the country's 2020 and 2021 profits; ``distance`` comes from
        ``calculate_minkowski_distance`` on the absolute differences between
        employee count and median salary.

    Raises:
        ValueError: If a required column is missing from ``headers`` or a
            numeric field cannot be parsed.
    """
    col_country = headers.index('country')
    col_profit_2020 = headers.index('profits in 2020(million)')
    col_profit_2021 = headers.index('profits in 2021(million)')
    col_employees = headers.index('number of employees')
    col_salary = headers.index('median salary')

    profits_2020 = {}
    profits_2021 = {}
    gaps = {}

    for row in data:
        name = row[col_country]
        earlier = float(row[col_profit_2020])
        later = float(row[col_profit_2021])
        staff = int(row[col_employees])
        salary = float(row[col_salary])

        # Group the two profit series by country for the t-test.
        profits_2020.setdefault(name, []).append(earlier)
        profits_2021.setdefault(name, []).append(later)

        # Per-organisation |employees - median salary|, the distance input.
        gaps.setdefault(name, []).append(abs(staff - salary))

    summary = {}
    for name, series_2020 in profits_2020.items():
        t_value = calculate_t_test(series_2020, profits_2021[name])
        distance = calculate_minkowski_distance(gaps[name])
        summary[name] = [round(t_value, 4), round(distance, 4)]

    return summary

def calculate_category_info(data, headers):
    """Group organisations by category and rank them within each category.

    Args:
        data: Rows of string values, one list per organisation.
        headers: Lower-cased column names matching the positions in each row.

    Returns:
        A dict mapping category -> {organisation id: [employees,
        absolute percentage profit change from 2020 to 2021, rank]}.
        Rank 1 goes to the organisation with the most employees; ties are
        broken by the larger percentage change. A repeated organisation id
        within a category replaces the earlier entry.

    Raises:
        ValueError: If a required column is missing or a number is malformed.
        ZeroDivisionError: If an organisation's 2020 profit is zero.
    """
    col_category = headers.index('category')
    col_org = headers.index('organisation id')
    col_employees = headers.index('number of employees')
    col_profit_2020 = headers.index('profits in 2020(million)')
    col_profit_2021 = headers.index('profits in 2021(million)')

    by_category = {}
    for row in data:
        group = row[col_category]
        org = row[col_org]
        staff = int(row[col_employees])
        earlier = float(row[col_profit_2020])
        later = float(row[col_profit_2021])

        change_pct = abs((later - earlier) / earlier * 100)
        # The trailing 0 is a placeholder for the rank filled in below.
        by_category.setdefault(group, {})[org] = [staff, change_pct, 0]

    for orgs in by_category.values():
        # Negated keys give a descending order on both criteria.
        ordered = sorted(orgs.values(), key=lambda rec: (-rec[0], -rec[1]))
        for position, record in enumerate(ordered, start=1):
            record[2] = position

    return by_category

def calculate_t_test(sample1, sample2):
    """Return the unequal-variance (Welch) t statistic for two samples.

    The statistic is ``(mean1 - mean2) / sqrt(var1 / n1 + var2 / n2)``,
    where each variance is the sample variance (divisor ``n - 1``).

    Args:
        sample1: Sequence of numbers with at least two values.
        sample2: Sequence of numbers with at least two values.

    Returns:
        The t statistic as a float.

    Raises:
        ZeroDivisionError: If either sample has fewer than two values, or if
            both samples have zero variance.
    """
    n1 = len(sample1)
    n2 = len(sample2)
    mean1 = sum(sample1) / n1
    mean2 = sum(sample2) / n2
    # Sample variances (Bessel-corrected).
    var1 = sum((x - mean1) ** 2 for x in sample1) / (n1 - 1)
    var2 = sum((x - mean2) ** 2 for x in sample2) / (n2 - 1)
    # Standard error of the difference between the two means.
    standard_error = (var1 / n1 + var2 / n2) ** 0.5
    return (mean1 - mean2) / standard_error

def calculate_minkowski_distance(data, p=3):
    """Return the order-``p`` Minkowski length of a sequence of differences.

    Computes ``(sum(|x| ** p for x in data)) ** (1 / p)``. The values in
    ``data`` are taken as already-computed per-item differences.

    Args:
        data: Iterable of numbers.
        p: Order of the distance; defaults to 3, the value this project uses.

    Returns:
        The distance as a number; 0 for empty input.

    Raises:
        ValueError: If ``p`` is not positive.
    """
    if p <= 0:
        raise ValueError("p must be positive")
    return sum(abs(x) ** p for x in data) ** (1 / p)


In [2]:
# Run the analysis on the organisations CSV and keep both result tables.
country_info, category_info = main('Organisations.csv')

In [3]:
# Display the per-country results; the notebook's echoed output follows.
country_info

{'afghanistan': [0.1607, 4400.639],
 'albania': [0.7401, 1784.3519],
 'algeria': [-12.8855, 9454.9113],
 'angola': [-4.0404, 4883.3185],
 'anguilla': [-15.6868, 7327.9277],
 'argentina': [-8.4948, 8251.7186],
 'australia': [-7.2305, 8539.3766],
 'austria': [-0.7002, 7707.5591],
 'bahamas': [-9.2531, 6331.6231],
 'bahrain': [-1.8167, 4734.8595],
 'bangladesh': [-0.8906, 3085.9946],
 'belgium': [-15.8825, 9901.866],
 'botswana': [-4.1164, 9427.1852],
 'brazil': [-3.5173, 10174.3314],
 'brunei darussalam': [-7.0044, 4673.7072],
 'bulgaria': [-8.8506, 8673.1453],
 'cambodia': [-4.1852, 7709.4533],
 'cameroon': [-0.3548, 2244.7215],
 'canada': [-10.7296, 9092.6039],
 'chad': [-1.8277, 5772.3104],
 'chile': [-2.2403, 8162.1705],
 'china': [-0.4694, 10448.2507],
 'colombia': [-5.1484, 2972.4644],
 'comoros': [-2.9222, 5345.4113],
 'congo': [-5.22, 8852.8543],
 'cook islands': [-7.1385, 2408.9416],
 'costa rica': [-3.1319, 4050.3388],
 "cote d'ivoire": [-2.7793, 6123.6504],
 'croatia': [-0.119