-
Notifications
You must be signed in to change notification settings - Fork 0
/
ads1_assignement3.py
470 lines (378 loc) · 15 KB
/
ads1_assignement3.py
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 30 17:43:06 2023

@author: tayssirboukrouba
"""

import warnings

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.cluster as cluster
import sklearn.metrics as skmet
import sklearn.preprocessing as pp
from scipy.optimize import curve_fit

import errors as err

# Silence every warning emitted by the libraries used below
# (the filter is global, not limited to one warning category)
warnings.filterwarnings("ignore")


def read_and_transpose(filename):
    """
    Reads a csv file of indicators and returns the cleaned dataframe
    together with a transposed version of it.

    Parameters:
    * filename (str): path of the csv file to read. The file must contain
      the columns 'Series Name', 'Series Code' and 'Country Code'; after the
      two code columns are dropped, every column from the third one onwards
      is treated as a yearly value column (1980 to 2019, in order).

    Returns:
    * df (DataFrame) : cleaned dataframe with the year columns renamed to
      '1980' ... '2019' and converted to float
    * transp_df (DataFrame) : transposed version of df, using the first row
      of the transpose as header and coerced to numeric
    """
    # reading the data
    df = pd.read_csv(filename)

    # cleaning the data: back-fill gaps from the following row, then drop
    # the rows that are still incomplete. DataFrame.bfill() replaces the
    # deprecated fillna(method='bfill') spelling.
    df = df.bfill()
    df = df.dropna()
    df = df.drop(columns=['Series Code', 'Country Code'])

    # Rename the value columns, in order, to plain year strings
    year_cols = [str(year) for year in range(1980, 2020)]
    df = df.rename(columns=dict(zip(df.columns[2:], year_cols)))

    # renaming indicators: the short names are assigned in the order in
    # which the original series names first appear in the file
    old_vals = df['Series Name'].unique()
    new_vals = ['Population', 'GDP', 'Inflation',
                'Debt', 'ANIPC', 'Exchange Rate']
    df['Series Name'] = df['Series Name'].replace(
        dict(zip(old_vals, new_vals)))

    # non-numeric entries in the year columns become NaN
    df[year_cols] = df[year_cols].apply(
        pd.to_numeric, errors='coerce').astype(float)

    # creating the transposed dataframe
    trans_df = df.T
    trans_df.columns = trans_df.iloc[0].values.tolist()
    trans_df = trans_df.iloc[1:]
    transp_df = trans_df.apply(pd.to_numeric, errors='coerce')

    # returning the dataframes
    return df, transp_df


def preprocessing(df, ind):
    """
    Preprocesses a dataframe for a specific indicator.

    Parameters:
    * df (DataFrame): a pandas DataFrame with the columns 'Series Name',
      'Country Name', '1980' and '2019'.
    * ind (str): the indicator ('Series Name' value) to be processed.

    Returns:
    * df (DataFrame): a DataFrame with the columns '1980' and 'Growth'
      (average growth per year between 1980 and 2019, in percent of the
      1980 value) for the specified indicator.
    * norm (array): a numpy array containing the normalized data.
    * scaler (RobustScaler): a sklearn RobustScaler object fitted to the
      processed data.
    """
    # Slicing & Processing
    mask = df['Series Name'] == ind
    inc_pc = df[mask].drop('Series Name', axis=1)

    growth = inc_pc[["Country Name", "1980"]].copy()
    diff = inc_pc["2019"] - inc_pc["1980"]
    # 39 is the number of years between 1980 and 2019
    growth["Growth"] = 100.0 / 39.0 * diff / inc_pc["1980"]

    # Rows with a missing or zero 1980 value produce NaN / inf growth,
    # which the clustering step cannot handle, so they are removed here
    features = growth[["1980", "Growth"]]
    features = features.replace([np.inf, -np.inf], np.nan).dropna()

    # Scaling
    scaler = pp.RobustScaler()
    norm = scaler.fit_transform(features)

    return features, norm, scaler


def silhouette(norm_df, min_clust, max_clust):
    """
    Calculates silhouette scores for normalized data across a range of
    cluster numbers using KMeans clustering. Each score is also printed.

    Parameters:
    * norm_df (array or DataFrame): normalized data for clustering.
    * min_clust (int): the minimum number of clusters to consider.
    * max_clust (int): the maximum number of clusters to consider
      (inclusive).

    Returns:
    * scores_list (list): a list of silhouette scores (floats), one per
      cluster number from min_clust to max_clust.
    """
    scores_list = []

    for ic in range(min_clust, max_clust + 1):
        # set up the clusterer with the number of expected clusters
        kmeans = cluster.KMeans(n_clusters=ic, n_init=20)
        # Fit the data, results are stored in the kmeans object
        kmeans.fit(norm_df)
        # calculate the silhouette score
        score = skmet.silhouette_score(norm_df, kmeans.labels_)
        scores_list.append(float(score))
        print(f"The silhouette score for {ic: 3d} is {score: 7.4f}")

    return scores_list


def clustering(df, nc, inertia, scaler):
    """
    Performs KMeans clustering on normalized data and returns the cluster
    centers (in the original scale) and the labels.

    Parameters:
    * df (array or DataFrame): the normalized data to cluster.
    * nc (int): the number of clusters to form.
    * inertia (int): the number of times the KMeans algorithm will be run
      with different centroid seeds (passed to KMeans as n_init).
    * scaler (RobustScaler): the fitted scaler that produced df; used to
      convert the cluster centers back to the original scale.

    Returns:
    * xcen (array): an array containing x-coordinates of the cluster centers.
    * ycen (array): an array containing y-coordinates of the cluster centers.
    * labels (array): an array containing cluster labels for each data point.
    """
    # set up the clusterer with the number of expected clusters
    kmeans = cluster.KMeans(n_clusters=nc, n_init=inertia)
    # Fit the data, results are stored in the kmeans object
    kmeans.fit(df)

    # extract cluster labels
    labels = kmeans.labels_

    # extract the estimated cluster centres and convert to original scales
    cen = scaler.inverse_transform(kmeans.cluster_centers_)
    xcen = cen[:, 0]
    ycen = cen[:, 1]

    return xcen, ycen, labels


def clusterplot(data, year, col, xcen, ycen, labels, xlabel, ylabel, title):
    """
    Generates a cluster plot for a specified dataset with cluster
    information.

    Parameters:
    * data (DataFrame): A pandas DataFrame containing the data.
    * year (str): The column of data to be used for the x-axis.
    * col (str): The column of data to be used for the y-axis.
    * xcen (array-like): x-coordinates of the cluster centers.
    * ycen (array-like): y-coordinates of the cluster centers.
    * labels (array-like): cluster label of each row of data, used to
      colour the points.
    * xlabel (str): The label for the x-axis.
    * ylabel (str): The label for the y-axis.
    * title (str): The title of the cluster plot.

    Returns:
    * None: The function draws the cluster plot on a new Matplotlib figure.
    """
    # defining our data (taken from the arguments, not from module globals)
    x = data[year]
    y = data[col]

    plt.figure(figsize=(8, 5))

    # plotting scatter plot, coloured by cluster label
    cm = matplotlib.colormaps["Paired"]
    plt.scatter(x, y, s=30, c=labels, marker="o", cmap=cm)

    # show cluster centres
    plt.scatter(xcen, ycen, 60, "k", marker="d", label='Kmeans-centers')
    plt.scatter(xcen, ycen, 45, "y", marker="+", label="original centres")

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(facecolor='white')

    # both axes start at 2; points below that value are not displayed
    plt.xlim(left=2)
    plt.ylim(bottom=2)


def fitting(data, indicator, country, error=False):
    """
    Fits a logistic curve to time-series data and plots the data, the fit
    extended to 2030 and, optionally, the error range.

    The axis label, title, text position and initial guess are hard-coded
    for the GDP of the United States.

    Parameters:
    * data (DataFrame): a pandas DataFrame with the columns 'Series Name'
      and 'Country Name' plus the year columns '1980' ... '2019'.
    * indicator (str): the indicator for which the curve will be fitted.
    * country (str): the country for which the curve will be fitted.
    * error (bool, optional): whether to shade the error range.
      Default is False.

    Returns:
    * None: The function draws the fit on a new Matplotlib figure.
    """
    # defining our data
    data = data.set_index(['Series Name', 'Country Name'])
    data = data.loc[indicator]
    xdata = np.array([float(year) for year in range(1980, 2020)])
    ydata = data.loc[country].to_numpy(dtype=float)

    # defining our logistic function
    def logistic(t, n0, g, t0):
        return n0 / (1 + np.exp(-g * (t - t0)))

    param, covar = curve_fit(
        logistic, xdata, ydata, p0=(3e12, 0.1, 1980))

    # evaluating the fit up to 2030
    year = np.linspace(1980, 2030, 40)
    forecast = logistic(year, *param)
    # presumably the propagated 1-sigma uncertainty of the fit; see the
    # project's errors module to confirm
    sigma = err.error_prop(year, logistic, param, covar)
    up = forecast + sigma
    low = forecast - sigma

    # 2030 GDP Prediction (last point of the forecast grid)
    number = forecast[-1]

    plt.figure(figsize=(10, 6))
    plt.plot(xdata, ydata, label='Original Data')
    plt.plot(year, forecast, label='Best Fit (Logestic)', linestyle='--')
    plt.xlabel('Year')
    plt.ylabel('U.S. GDP')
    plt.title('Evolution of US GDP Across Years')
    # the lower and upper bounds are separated by ', ' so that they are
    # readable as two numbers
    plt.text(2010, 3e12,
             f' 2030 Prediction : {number / 1e12:.3f} trillion'
             f'\n\n Uncertainty Range: [{low[-1] / 1e12:.3f}, '
             f'{up[-1] / 1e12:.3f}] trillion',
             fontsize=10, color='Blue')

    if error:
        plt.fill_between(year, low, up, color='yellow',
                         alpha=0.5, label='Error Range')

    plt.legend(facecolor='white')


def lineplot(df, indicator, countries, years, title):
    """
    Generates a line plot for a specified indicator, countries, and years.

    Parameters:
    * df (DataFrame): A pandas DataFrame indexed by
      ('Series Name', 'Country Name') with one column per year.
    * indicator (str): The indicator for which the line plot will be
      generated.
    * countries (list): A list of country names to include in the plot.
    * years (list): The years (column names) to show as x-axis ticks.
    * title (str): The title of the line plot.

    Returns:
    * None: The function draws the line plot on a new Matplotlib figure.
    """
    deta = df.loc[indicator]
    deta = deta.loc[countries]
    # keep only the years available for every selected country
    deta = deta.dropna(axis=1)
    x = deta.columns.tolist()

    fig, ax = plt.subplots(figsize=(10, 7))

    for country in countries:
        ax.plot(x, deta.loc[country], label=country)

    ax.legend(facecolor='white')
    ax.set_xticks(years)
    # reference line at a rate of zero
    ax.axhline(0, color='red', linestyle='--')
    ax.set_xlabel('Years')
    ax.set_title(title)
    ax.set_ylabel('Rate')


def horizentalbar(df, indicator, countries, years, xlabel, title):
    """
    Generates a horizontal bar plot for a specified indicator, countries,
    and years. The dataframe passed in is not modified.

    Parameters:
    * df (DataFrame): A pandas DataFrame indexed by
      ('Series Name', 'Country Name') with one column per year.
    * indicator (str): The indicator for which the horizontal bar plot will
      be generated.
    * countries (list): A list of country names to include in the plot.
    * years (list): A list of years (column names) to include in the plot.
    * xlabel (str): The label for the x-axis.
    * title (str) : The title for the hbar plot

    Returns:
    * None: The function draws the horizontal bar plot using pandas and
      Matplotlib.
    """
    my_data = df.loc[indicator]
    my_data = my_data.loc[countries, years]

    my_data.plot(kind='barh', xlabel=xlabel, figsize=(10, 6))
    plt.legend(facecolor='white')
    plt.title(title)


def barplot(df, indicator, countries, ylabel, title, years=None):
    """
    Generates a stacked bar plot for a specified indicator and countries.

    Parameters:
    * df (DataFrame): A pandas DataFrame indexed by
      ('Series Name', 'Country Name') with one column per year.
    * indicator (str): The indicator for which the bar plot will be
      generated.
    * countries (list): A list of country names to include in the plot.
    * ylabel (str): The label for the y-axis.
    * title (str): The title of the bar plot.
    * years (list, optional): The years (column names) to include in the
      plot. Default is None, which plots every column.

    Returns:
    * None: The function draws the bar plot using pandas and Matplotlib.
    """
    deta = df.loc[indicator]
    deta = deta.loc[countries]
    if years is not None:
        deta = deta[list(years)]

    deta.plot(kind='bar', stacked=True, figsize=(10, 6))
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(facecolor='white')


def bubblechart(df, x, y, year, size_col, title):
    """
    Generates a bubble chart of one indicator against another for a single
    year, with a third indicator setting the size and colour of the bubbles.

    Parameters:
    * df (DataFrame): A pandas DataFrame indexed by
      ('Series Name', 'Country Name') with one column per year.
    * x (str): The indicator to be used for the x-axis. Countries whose
      value is not below 0.5e13 are treated as outliers and left out.
    * y (str): The indicator to be used for the y-axis.
    * year (str): The specific year for which the data will be plotted.
    * size_col (str): The indicator to be used for the size and colour of
      the bubbles.
    * title (str): The title of the bubble chart.

    Returns:
    * None: The function draws the bubble chart using Seaborn and
      Matplotlib.
    """
    plt.figure(figsize=(7, 4))

    # One row per country, one column per indicator, so that the three
    # plotted values of a country always stay aligned
    snapshot = df[year]
    # unstack needs unique (indicator, country) pairs
    snapshot = snapshot[~snapshot.index.duplicated(keep='first')]
    data = snapshot.unstack('Series Name')

    # Removing Outliers & Null Values
    data = data[data[x] < 0.5e13]
    data = data.dropna(subset=[x, y, size_col])

    x_vals = data[x].to_numpy()
    y_vals = data[y].to_numpy()
    s_vals = data[size_col].to_numpy()

    # plotting data : x on the x-axis and y on the y-axis, matching the
    # axis labels below
    sns.scatterplot(x=x_vals, y=y_vals, size=s_vals,
                    sizes=(20, 200), alpha=0.7, hue=s_vals)
    plt.xlabel('GDP')
    plt.ylabel('Population')
    # NOTE(review): the legend entries are fixed strings and are not
    # derived from the plotted values - confirm they match the data
    plt.legend(labels=['10K $', '20K $', '30K $', '40K $', '50K $'],
               title='Income/Capita')
    plt.title(title)


# reading data :
data, transp_df = read_and_transpose('data.csv')

# preprocessing data :
df, norm_df, scaler = preprocessing(data, 'ANIPC')

# silhouette test :
list_sc = silhouette(norm_df, 2, 10)

# getting cluster center coordinates and labels
x_cent, y_cent, lab = clustering(norm_df, 3, 20, scaler)

# defining scatter plot (cluster plot) variables
year = "1980"
y = "Growth"
xlabel = "Income Per capita 1980"
ylabel = "IPC growth/year [%]"
title = "Clusters of Income/Capita Vs Annual Growth"

# calling the clusterplot() function :
clusterplot(df, year, y, x_cent, y_cent, lab, xlabel, ylabel, title)

# fitting the GDP of the United States, with the error range
fitting(data, 'GDP', 'United States', True)

# lineplot
data = data.set_index(['Series Name', 'Country Name'])
countries = ['Germany', 'Gabon', 'Mali']
years = [str(year) for year in range(1990, 2020, 5)]
title = 'Inflation Rates Per Country (annual %)'
lineplot(data, 'Inflation', countries, years, title)

# defining horizontal bar plot variables
countries = ['Gabon', 'Mali', 'Turkiye', 'Kenya', 'Costa Rica']
years = [str(year) for year in range(1989, 2020, 10)]
xlabel = 'Total debt service (% of GNI)'
title = "Total Debt For Poor & Developping Countries"

# calling the horizentalbar() function
horizentalbar(data, 'Debt', countries, years, xlabel, title)

# defining barplot variables
indicator = 'Exchange Rate'
countries1 = ['Burundi', 'Algeria', 'Rwanda', 'Chad', 'Gabon']
countries2 = ['Germany', 'Qatar', 'Singapore']
ylabel = 'Official Exchange Rate For $'
title1 = 'Exchange Rate For Poor and Developping Countries Across Decades'
title2 = 'Exchange Rate For Devolopped Countries Across Decades'

# calling the barplot() function (same decades as the horizontal bar plot)
barplot(data, indicator, countries1, ylabel, title1, years)
barplot(data, indicator, countries2, ylabel, title2, years)

# defining bubble chart variables
x = 'GDP'
y = 'Population'
s = 'ANIPC'
year = '2016'
title = 'Bubblechart of the effect of GDP/Population on IPC '

# calling the bubblechart() function
bubblechart(data, x, y, year, s, title)