# ads1_assignement3.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 30 17:43:06 2023

@author: tayssirboukrouba

Clustering, curve fitting and descriptive plots for a set of country-level
indicators read from a CSV file.
"""
import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.cluster as cluster
import sklearn.metrics as skmet
import sklearn.preprocessing as pp
from scipy.optimize import curve_fit

import errors as err

# Suppress every warning raised from here on (the filter is not specific)
warnings.filterwarnings("ignore")


def read_and_transpose(filename):
    '''
    Reads a filename of csv dataframe and returns 2 dataframes

    Parameters:
        * filename (String): a string of filename source

    Returns:
        * df (DataFrame) : cleaned dataframe with one row per
            (indicator, country) and one column per year 1980..2019
        * transp_df (DataFrame) : transposed version of the cleaned frame
    '''
    # reading the data
    df = pd.read_csv(filename)

    # cleaning the data: back-fill gaps, then drop whatever is still empty
    # (DataFrame.bfill replaces the deprecated fillna(method='bfill'))
    df = df.bfill()
    df = df.dropna()
    df = df.drop(columns=['Series Code', 'Country Code'])

    # rename the columns after the first two to plain year strings
    # NOTE(review): assumes those columns are exactly the years 1980..2019,
    # in order - confirm against the CSV layout
    year_cols = [str(year) for year in range(1980, 2020)]
    df = df.rename(columns=dict(zip(df.columns[2:], year_cols)))

    # renaming indicators
    # NOTE(review): the short names are matched to the indicators by their
    # order of first appearance in the file - confirm that order
    new_vals = ['Population', 'GDP', 'Inflation',
                'Debt', 'ANIPC', 'Exchange Rate']
    old_vals = df['Series Name'].unique()
    df['Series Name'] = df['Series Name'].replace(
        dict(zip(old_vals, new_vals)))

    # anything that is not a number in a year column becomes NaN
    df[year_cols] = df[year_cols].apply(
        pd.to_numeric, errors='coerce').astype(float)

    # creating the transposed dataframe; its first row supplies the header
    trans_df = df.T
    trans_df.columns = trans_df.iloc[0].values.tolist()
    trans_df = trans_df.iloc[1:]
    transp_df = trans_df.apply(pd.to_numeric, errors='coerce')

    # returning the dataframes
    return df, transp_df


def preprocessing(df, ind):
    """
    Preprocesses a dataframe for a specific indicator.

    Keeps the 1980 value of the indicator and its average yearly growth
    (in percent of the 1980 value) between 1980 and 2019.

    Parameters:
        * df (DataFrame): a pandas DataFrame containing the data, with a
            'Series Name' column, a 'Country Name' column and the year
            columns "1980" and "2019".
        * ind (str): the indicator to be processed.

    Returns:
        * df (DataFrame): a DataFrame with the columns "1980" and "Growth"
            for the specified indicator.
        * norm (array): a numpy array containing the normalized data.
        * scaler (RobustScaler): a sklearn RobustScaler object fitted to the
            processed data.
    """
    # Slicing & Processing
    selected = df[df['Series Name'].isin([ind])].drop('Series Name', axis=1)
    growth = selected[["Country Name", "1980"]].copy()
    diff = selected["2019"] - selected["1980"]
    # 39 is the number of years between 1980 and 2019
    growth["Growth"] = 100.0 / 39.0 * diff / selected["1980"]

    # Rows with a missing value (or an infinite growth caused by a zero 1980
    # value) cannot be clustered, so they are removed before scaling
    features = growth[["1980", "Growth"]]
    features = features.replace([np.inf, -np.inf], np.nan).dropna()

    # Scaling
    scaler = pp.RobustScaler()
    scaler.fit(features)
    norm = scaler.transform(features)

    return features, norm, scaler


def silhouette(norm_df, min_clust, max_clust):
    """
    Calculates silhouette scores for normalized data across a range
    of cluster numbers using KMeans clustering.

    Parameters:
        * norm_df (array-like): normalized data for clustering.
        * min_clust (int): the minimum number of clusters to consider.
        * max_clust (int): the maximum number of clusters to consider
            (inclusive).

    Returns:
        * scores_list (list): a list of silhouette scores (floats), one per
            cluster number from min_clust to max_clust.
    """
    scores_list = []
    for ic in range(min_clust, max_clust + 1):
        # set up the clusterer with the number of expected clusters
        kmeans = cluster.KMeans(n_clusters=ic, n_init=20)
        # Fit the data, results are stored in the kmeans object
        kmeans.fit(norm_df)
        # calculate the silhouette score
        score = skmet.silhouette_score(norm_df, kmeans.labels_)
        scores_list.append(score)
        print(f"The silhouette score for {ic: 3d} is {score: 7.4f}")

    return scores_list


def clustering(df, nc, inertia, scaler):
    """
    Performs KMeans clustering on the data and returns cluster centers
    and labels.

    Parameters:
        * df (array-like): the (normalized) data to cluster; two columns are
            expected because the first two centre coordinates are returned.
        * nc (int): the number of clusters to form.
        * inertia (int): the number of times the KMeans algorithm will be
            run with different centroid seeds (passed on as n_init).
        * scaler (RobustScaler): the scaler that was fitted to the data;
            used to convert the centres back to the original scale.

    Returns:
        * xcen (array): x-coordinates of the cluster centers.
        * ycen (array): y-coordinates of the cluster centers.
        * labels (array): cluster label of each data point.
    """
    # set up the clusterer with the number of expected clusters
    kmeans = cluster.KMeans(n_clusters=nc, n_init=inertia)
    # Fit the data, results are stored in the kmeans object
    kmeans.fit(df)

    # extract the estimated cluster centres and convert to original scales
    centres = scaler.inverse_transform(kmeans.cluster_centers_)

    return centres[:, 0], centres[:, 1], kmeans.labels_


def clusterplot(data, year, col, xcen, ycen, labels, xlabel, ylabel, title):
    """
    Generates a cluster plot for a specified dataset with cluster information.

    Parameters:
        * data (DataFrame): A pandas DataFrame containing the data.
        * year (str): The column of `data` plotted on the x-axis.
        * col (str): The column of `data` plotted on the y-axis.
        * xcen (array-like): x-coordinates of the cluster centers.
        * ycen (array-like): y-coordinates of the cluster centers.
        * labels (array-like): cluster label of each row of `data`, used to
            colour the points.
        * xlabel (str): The label for the x-axis.
        * ylabel (str): The label for the y-axis.
        * title (str): The title of the cluster plot.

    Returns:
        * None: The function draws the cluster plot on a new Matplotlib
            figure.
    """
    # defining our data (taken from the arguments, not from module globals)
    x = data[year]
    y = data[col]

    plt.figure(figsize=(8, 5))

    # plotting scatter plot, coloured by cluster label
    cm = matplotlib.colormaps["Paired"]
    plt.scatter(x, y, 30, labels, marker="o", cmap=cm)

    # show cluster centres
    plt.scatter(xcen, ycen, 60, "k", marker="d", label='Kmeans-centers')
    plt.scatter(xcen, ycen, 45, "y", marker="+", label="original centres")

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(facecolor='white')
    # only the lower axis limits are set; the upper ones stay automatic
    plt.xlim(2)
    plt.ylim(2)


def fitting(data, indicator, country, error=False):
    """
    Fits a logistic curve to time-series data, extends it to 2030 and plots
    the result, optionally with the error range.

    Parameters:
        * data (DataFrame): a pandas DataFrame containing time-series data
            with 'Series Name' and 'Country Name' columns and one column per
            year from 1980 to 2019.
        * indicator (str): the indicator for which the curve will be fitted.
        * country (str): the country for which the curve will be fitted.
        * error (bool, optional): whether to plot the error range.
            Default is False.

    Returns:
        * None: The function draws on a new Matplotlib figure.
            NOTE(review): axis label, title and the p0 start values are
            written for U.S. GDP whatever indicator/country is passed.
    """
    # defining our data
    data = data.set_index(['Series Name', 'Country Name'])
    data = data.loc[indicator]
    xdata = np.array([float(year) for year in range(1980, 2020)])
    ydata = data.loc[country]

    # defining our logistic function
    def logistic(t, n0, g, t0):
        return n0 / (1 + np.exp(-g * (t - t0)))

    param, covar = curve_fit(
        logistic, xdata, ydata, p0=(3e12, 0.1, 1980))

    year = np.linspace(1980, 2030, 40)
    forecast = logistic(year, *param)
    sigma = err.error_prop(year, logistic, param, covar)
    up = forecast + sigma
    low = forecast - sigma

    # 2030 GDP Prediction (last point of the forecast grid)
    number = forecast[-1]

    plt.figure(figsize=(10, 6))
    plt.plot(xdata, ydata, label='Original Data')
    plt.plot(year, forecast, label='Best Fit (Logestic)', linestyle='--')
    plt.xlabel('Year')
    plt.ylabel('U.S. GDP')
    plt.title('Evolution of US GDP Across Years')
    # the lower and upper bound are separated by ", " so they stay readable
    plt.text(2010, 3e12, f' 2030 Prediction : {number / 1e12:.3f} trillion'
             f'\n\n Uncertainty Range: [{low[-1] / 1e12:.3f}'
             f', {up[-1] / 1e12:.3f}] trillion', fontsize=10, color='Blue')
    if error:
        plt.fill_between(year, low, up, color='yellow',
                         alpha=0.5, label='Error Range')
    plt.legend(facecolor='white')


def lineplot(df, indicator, countries, years, title):
    """
    Generates a line plot for a specified indicator, countries, and years.

    Parameters:
        * df (DataFrame): A pandas DataFrame indexed by
            ('Series Name', 'Country Name') with one column per year.
        * indicator (str): The indicator for which the line plot will be
            generated.
        * countries (list): A list of country names to include in the plot.
        * years (list): The year labels used as x-axis ticks.
        * title (str): The title of the line plot.

    Returns:
        * None: The function draws the line plot on a new Matplotlib figure.
    """
    deta = df.loc[indicator].loc[countries]
    # keep only the years for which every selected country has a value
    deta = deta.dropna(axis=1)
    x = deta.columns.tolist()

    _, ax = plt.subplots(figsize=(10, 7))
    for country in countries:
        ax.plot(x, deta.loc[country], label=country)

    # decorations are applied once, after all the lines are drawn
    ax.legend(facecolor='white')
    ax.set_xticks(years)
    ax.axhline(0, color='red', linestyle='--')
    ax.set_xlabel('Years')
    plt.title(title)
    plt.ylabel('Rate')


def horizentalbar(df, indicator, countries, years, xlabel, title):
    """
    Generates a horizontal bar plot for a specified indicator, countries,
    and years.

    Parameters:
        * df (DataFrame): A pandas DataFrame indexed by
            ('Series Name', 'Country Name') with one column per year.
            It is not modified.
        * indicator (str): The indicator for which the horizontal bar plot
            will be generated.
        * countries (list): A list of country names to include in the plot.
        * years (list): A list of years to include in the plot.
        * xlabel (str): The label passed to the pandas plot call.
        * title (str) : The title for the hbar plot

    Returns:
        * None: The function draws the horizontal bar plot using pandas and
            Matplotlib.
    """
    my_data = df.loc[indicator].loc[countries][years]
    my_data.plot(kind='barh', xlabel=xlabel, figsize=(10, 6))
    plt.legend(facecolor='white')
    plt.title(title)


def barplot(df, indicator, countries, ylabel, title, years=None):
    """
    Generates a stacked bar plot for a specified indicator and countries.

    Parameters:
        * df (DataFrame): A pandas DataFrame indexed by
            ('Series Name', 'Country Name') with one column per year.
        * indicator (str): The indicator for which the bar plot will be
            generated.
        * countries (list): A list of country names to include in the plot.
        * ylabel (str): The label for the y-axis.
        * title (str): The title of the bar plot.
        * years (list, optional): The year columns to include. Default is
            None, which keeps every column.

    Returns:
        * None: The function draws the bar plot using pandas and Matplotlib.
    """
    deta = df.loc[indicator].loc[countries]
    if years is not None:
        deta = deta[years]
    deta.plot(kind='bar', stacked=True, figsize=(10, 6))
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(facecolor='white')


def bubblechart(df, x, y, year, size_col, title):
    """
    Generates a bubble chart for a specified DataFrame and indicators.

    Parameters:
        * df (DataFrame): A pandas DataFrame indexed by
            ('Series Name', 'Country Name') with one column per year.
        * x (str): The indicator to be used for the x-axis.
        * y (str): The indicator to be used for the y-axis.
        * year (str): The specific year for which the data will be plotted.
        * size_col (str): The indicator to be used for the size (and colour)
            of the bubbles.
        * title (str): The title of the bubble chart.

    Returns:
        * None: The function draws the bubble chart using Seaborn and
            Matplotlib.
    """
    plt.figure(figsize=(7, 4))

    # Transposing DataFrame (years become the rows)
    data = df.T

    # Removing Outliers & Null Values: values of `x` at or above 0.5e13 are
    # masked out, then the requested year is selected
    data = data[data[x] < 0.5e13]
    data = data.loc[year]
    data = data.reset_index()
    data.drop(columns='Country Name', inplace=True)
    data.set_index('Series Name', inplace=True)
    data.dropna(inplace=True)

    # Turning dataframe into series
    data = data[year]
    # NOTE(review): the three arrays must end up with the same length for
    # the scatter plot - confirm the data guarantees this
    x_values = data.loc[x].values
    y_values = data.loc[y].values
    sizes = data.loc[size_col].values

    # plotting data: `x` on the x-axis and `y` on the y-axis, labelled with
    # the indicator names so the axes and their labels always agree
    sns.scatterplot(x=x_values, y=y_values, size=sizes, sizes=(20, 200),
                    alpha=0.7, hue=sizes)
    plt.xlabel(x)
    plt.ylabel(y)
    # NOTE(review): the legend entries are fixed text, not derived from the
    # data - confirm they match the size levels seaborn picks
    plt.legend(labels=['10K $', '20K $', '30K $',
               '40K $', '50K $'], title='Income/Capita')
    plt.title(title)


def main():
    """Runs the whole analysis on 'data.csv' and shows the figures."""
    # reading data :
    data, transp_df = read_and_transpose('data.csv')

    # preprocessing data :
    df, norm_df, scaler = preprocessing(data, 'ANIPC')

    # silhouette test (scores are printed by the function) :
    silhouette(norm_df, 2, 10)

    # getting cluster center cordinates and labels
    x_cent, y_cent, lab = clustering(norm_df, 3, 20, scaler)

    # defining scatter plot (cluster plot) variables
    year = "1980"
    y = "Growth"
    xlabel = "Income Per capita 1980"
    ylabel = "IPC growth/year [%]"
    title = "Clusters of Income/Capita Vs Annual Growth"

    # calling the clusterplot() function :
    clusterplot(df, year, y, x_cent, y_cent, lab, xlabel, ylabel, title)

    fitting(data, 'GDP', 'United States', True)

    # lineplot
    data = data.set_index(['Series Name', 'Country Name'])
    countries = ['Germany', 'Gabon', 'Mali']
    years = [str(year) for year in range(1990, 2020, 5)]
    lineplot(data, 'Inflation', countries, years,
             'Inflation Rates Per Country (annual %)')

    # defining horizental bar plot variables
    countries = ['Gabon', 'Mali', 'Turkiye', 'Kenya', 'Costa Rica']
    years = [str(year) for year in range(1989, 2020, 10)]
    xlabel = 'Total debt service (% of GNI)'
    title = "Total Debt For Poor & Developping Countries"

    # calling the horizentalbar() function
    horizentalbar(data, 'Debt', countries, years, xlabel, title)

    # The remaining plots only use rows without any missing year. This used
    # to happen as a hidden side effect inside horizentalbar(); it is now an
    # explicit step so those plots see the same data as before.
    data = data.dropna()

    # defining barplot variables
    indicator = 'Exchange Rate'
    countries1 = ['Burundi', 'Algeria', 'Rwanda', 'Chad', 'Gabon']
    countries2 = ['Germany', 'Qatar', 'Singapore']
    ylabel = 'Official Exchange Rate For $'
    title1 = 'Exchange Rate For Poor and Developping Countries Across Decades'
    title2 = 'Exchange Rate For Devolopped Countries Across Decades'

    # calling the barplot() function (same decade columns as the hbar plot)
    barplot(data, indicator, countries1, ylabel, title1, years=years)
    barplot(data, indicator, countries2, ylabel, title2, years=years)

    # defining bubble chart variables
    x = 'GDP'
    y = 'Population'
    s = 'ANIPC'
    year = '2016'
    title = 'Bubblechart of the effect of GDP/Population on IPC '

    # calling the bubblechart() function
    bubblechart(data, x, y, year, s, title)

    # display every figure when run as a plain script
    plt.show()


if __name__ == "__main__":
    main()