In [1]:
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np
# Load the five source tables (paths are relative to the working directory).
water = pd.read_csv('new_water_done5.csv')
green = pd.read_csv('pivoted_data_GREEN_GROWTH.csv')
inno = pd.read_csv('pivoted_data_inno.csv')
infra = pd.read_csv('pivoted_data_ITF_INFRA.csv')
MEI = pd.read_csv('MEI_time_to_year.csv')

# For every table, keep a view restricted to its float-typed columns.
columns_water, columns_inno, columns_green, columns_infra, columns_MEI = (
    frame.select_dtypes(include=['float'])
    for frame in (water, inno, green, infra, MEI)
)

ModuleNotFoundError: No module named 'plotly'

In [184]:
import pandas as pd
import plotly.express as px

# Load the water table from disk
df = pd.read_csv('new_water_done5.csv')

# Yearly mean of the three "Basic" columns, reshaped to long format so that
# every (Year, Column) pair is one row with its 'Average Value'.
basic_columns = ['NATIONAL-Basic', 'URBAN-Basic', 'RURAL-Basic']
averages = (
    df.groupby('Year')[basic_columns]
    .mean()
    .reset_index()
    .melt(id_vars='Year', value_vars=basic_columns,
          var_name='Column', value_name='Average Value')
)

# One line (with markers) per original column
fig = px.line(
    averages,
    x='Year',
    y='Average Value',
    color='Column',
    markers=True,
    labels={'Column': 'Legend Label'},
)

# Titles, axis captions and an explicit legend
fig.update_layout(
    title='Average Water Quality per Year',
    xaxis_title='Year',
    yaxis_title='Average Water Quality %',
    showlegend=True,
)

# For each line, report its first and last yearly average and the change between them
for column in averages['Column'].unique():
    column_values = averages.loc[averages['Column'] == column, 'Average Value']
    avg_start, avg_end = column_values.iloc[0], column_values.iloc[-1]
    difference = avg_end - avg_start
    print(
        f'{column} Start: {avg_start:.2f}',
        f'{column} End: {avg_end:.2f}',
        f'Improvement in %: {difference:.2f}',
        '',
        sep='\n',
    )

# Render the figure
fig.show()


NATIONAL-Basic Start: 83.28
NATIONAL-Basic End: 89.53
Improvement in %: 6.25

URBAN-Basic Start: 91.40
URBAN-Basic End: 94.03
Improvement in %: 2.63

RURAL-Basic Start: 72.19
RURAL-Basic End: 79.68
Improvement in %: 7.49



In [185]:
# Per-country (ISO3) means, kept as flat frames with the original column names.
avg_GDP = green.groupby('ISO3')['Real GDP per capita'].mean().reset_index()
avg_national_basic = water.groupby('ISO3')['NATIONAL-Basic'].mean().reset_index()

# possibly change to ISO3
# Per-country median of the vegetated-land loss column, exposed as 'Loss'.
eenvalue = green.groupby('ISO3')['Loss of natural and semi-natural vegetated land, % since 1992'].median()
green_veg_loss = eenvalue.rename('Loss').reset_index()

# Per-country mean real GDP per capita, exposed as 'GDP'.
gdp = green.groupby('ISO3')['Real GDP per capita'].mean()
gdp_green = gdp.rename('GDP').reset_index()

# Per-country mean of the national basic-water column, exposed as 'Basic'.
water_basic = water.groupby('ISO3')['NATIONAL-Basic'].mean()
water_basic_filt = water_basic.rename('Basic').reset_index()

In [187]:
import pandas as pd

# Define a custom function to get the last non-missing value
def last_non_missing_value(series):
    """Return the final non-NaN entry of ``series`` (by position), or None.

    Parameters
    ----------
    series : pandas.Series
        Values to scan; missing entries are ignored.

    Returns
    -------
    The last remaining value after dropping missing entries, or ``None``
    when nothing remains (empty or all-missing input).
    """
    present = series.dropna()
    if len(present) > 0:
        return present.iloc[-1]
    return None

# Group on "ISO3" and keep, per country, the last non-missing renewable-energy share
# ("last" means last in row order within the group).
last_values = (
    green
    .groupby('ISO3')['Renewable energy supply, % total energy supply']
    .apply(last_non_missing_value)
)

# Flatten to a two-column frame: "ISO3" and the selected "Last Value"
last_df = last_values.rename('Last Value').reset_index()

# Show the result
print(last_df)



    ISO3  Last Value
0    ABW         NaN
1    AFG         NaN
2    AGO       58.07
3    AIA         NaN
4    ALB       33.88
..   ...         ...
231  WSM         NaN
232  YEM        6.05
233  ZAF        6.21
234  ZMB       81.80
235  ZWE       77.33

[236 rows x 2 columns]


In [188]:
# Inner-join the four per-country tables on ISO3; only countries present in
# every table survive the chain.
merged_data = (
    last_df
    .merge(water_basic_filt, on='ISO3', how='inner')
    .merge(gdp_green, on='ISO3', how='inner')
    .merge(green_veg_loss, on='ISO3', how='inner')
)

# Show the combined frame
print(merged_data)


    ISO3  Last Value      Basic           GDP    Loss
0    ABW         NaN  96.822766  38237.170909  10.040
1    AFG         NaN  49.265605   1807.108000   2.170
2    AGO       58.07  50.094761   6173.841818   1.195
3    AIA         NaN  97.044593           NaN   0.455
4    ALB       33.88  91.149894  10087.662727   1.990
..   ...         ...        ...           ...     ...
211  WSM         NaN  90.218088   5373.420455   0.000
212  YEM        6.05  50.401790   3162.222727   2.495
213  ZAF        6.21  89.467232  11695.595455   1.430
214  ZMB       81.80  56.974370   2904.625909   2.385
215  ZWE       77.33  67.535750   2390.374545   3.885

[216 rows x 5 columns]


In [189]:

# (column in merged_data, axis label) for each parallel-coordinates axis.
# The 'Loss' label follows the source column it was aggregated from
# ("Loss of natural and semi-natural vegetated land, % since 1992").
parcoords_axes = [
    ('GDP', 'GDP'),
    ('Basic', 'Basic water availability'),
    ('Last Value', '% Renewable energy'),
    ('Loss', 'Loss of natural vegetated land, % since 1992'),
]

# One axis per entry, each scaled to the observed min/max of its column
# (pandas min/max skip missing values).
data = [
    go.Parcoords(
        dimensions=[
            dict(range=[merged_data[column].min(), merged_data[column].max()],
                 label=label,
                 values=merged_data[column])
            for column, label in parcoords_axes
        ]
    )
]

# Define the layout of the parallel plot
layout = go.Layout(
    title='Parallel Plot',
    plot_bgcolor='white',
    paper_bgcolor='white'
)

# Create the figure and display the parallel plot
fig = go.Figure(data=data, layout=layout)
fig.show()

In [190]:
# Split the observed GDP range into five equal-width intervals and store each
# country's interval in a new 'GDP_Bins' column.
merged_data['GDP_Bins'] = pd.cut(x=merged_data['GDP'], bins=5)


In [191]:
import pandas as pd


# Only numeric columns can be averaged; this leaves out ISO3 and the
# categorical GDP_Bins column itself.
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns

# Mean of every numeric column within each GDP bin, with the bin turned back
# into a regular column. observed=False is spelled out: it is the behaviour
# this cell has always had (empty bins are kept as rows), and stating it avoids
# the pandas FutureWarning about the default changing for categorical groupers.
averages = (
    merged_data
    .groupby('GDP_Bins', observed=False)[numeric_columns]
    .mean()
    .reset_index()
)


In [192]:
import plotly.express as px

# Keep only the four plotted dimensions, with a clean 0..n-1 index
parallel_axes = ['GDP', 'Basic', 'Last Value', 'Loss']
plot_data = averages[parallel_axes].reset_index(drop=True)

# Build the parallel-coordinates figure (one line per row of plot_data)
fig = px.parallel_coordinates(plot_data)

# Apply title and axis captions
layout_options = {
    'title': 'Average Value for Each Dimension within GDP Bins',
    'xaxis_title': 'Dimensions',
    'yaxis_title': 'Average Value',
}
fig.update_layout(**layout_options)

# Render the figure
fig.show()




import pandas as pd

# Only numeric columns can be averaged
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns

# Left-closed GDP bins. The edges are on the same scale as the GDP column
# (the merged table shows values in the thousands); edges of 0-110 would leave
# every country with a GDP above 110 outside all bins.
# The grouper is named 'GDP_Bins' so that reset_index() does not collide with
# the numeric 'GDP' column that is averaged alongside it.
gdp_bins = pd.cut(
    merged_data['GDP'],
    bins=[0, 20000, 40000, 60000, 80000, 110000],
    right=False,
).rename('GDP_Bins')

# Mean of every numeric column per bin, bins returned as a regular column.
# observed=False keeps empty bins as rows (the existing pandas default).
averages = (
    merged_data
    .groupby(gdp_bins, observed=False)[numeric_columns]
    .mean()
    .reset_index()
)

# Print the resulting averages DataFrame
print(averages)


import pandas as pd

# Single definition of the GDP bin edges so the two cuts below cannot drift apart.
gdp_bin_edges = [0, 20000, 40000, 60000, 80000, 110000]

# Left-closed bins stored on the frame for grouping.
merged_data['GDP_Bins'] = pd.cut(merged_data['GDP'], bins=gdp_bin_edges, right=False)

# Stand-alone binned series used by later plotting cells. The edges returned by
# retbins were never used, so they are no longer requested.
# NOTE(review): this cut keeps pandas' default right-closed intervals while
# GDP_Bins above is left-closed - confirm whether the difference is intended.
cutgdp = pd.cut(merged_data['GDP'], bins=gdp_bin_edges)

# Only numeric columns can be averaged
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns

# Mean of every numeric column per GDP bin, bins returned as a regular column.
# observed=False keeps empty bins as rows (the existing pandas default) and
# avoids the FutureWarning raised when it is left implicit.
averages = (
    merged_data
    .groupby('GDP_Bins', observed=False)[numeric_columns]
    .mean()
    .reset_index()
)

In [167]:
import plotly.express as px

# Select the plotted dimensions from the per-bin averages and renumber the rows
plot_data = averages.loc[:, ['GDP', 'Basic', 'Last Value', 'Loss']].reset_index(drop=True)

# Parallel-coordinates figure: one line per GDP bin
fig = px.parallel_coordinates(plot_data)

# Title and axis captions
fig.update_layout(title='Average Value for Each Dimension within GDP Bins',
                  xaxis_title='Dimensions',
                  yaxis_title='Average Value')

# Display
fig.show()



# Example dataset: restaurant tips (downloaded from the pandas test data).
tips_df = pd.read_csv('https://raw.githubusercontent.com/pandas-dev/pandas/main/pandas/tests/io/data/csv/tips.csv')

# Split tips into three equally populated groups; cutqscheid holds the bin edges.
tip_levels = ['Low', 'Medium', 'High']
cutqhisto, cutqscheid = pd.qcut(tips_df['tip'], q=3, labels=tip_levels, retbins=True)

# Ribbon colour per tip level. This mapping was referenced but never defined,
# which made the cell fail with a NameError.
colors = {'Low': 'lightsteelblue', 'Medium': 'goldenrod', 'High': 'firebrick'}

fig = go.Figure(data=go.Parcats(
    dimensions=[
        # 'category descending' orders category names alphabetically
        # (Medium, Low, High); an explicit array keeps the ordinal order.
        {'label': 'Tip',
         'values': cutqhisto,
         'categoryorder': 'array',
         'categoryarray': tip_levels[::-1]},
        {'label': 'Sex',
         'values': tips_df['sex'],
         'categoryorder': 'category ascending'},
        {'label': 'Smoker',
         'values': tips_df['smoker']},
        {'label': 'Day',
         'values': tips_df['day']},
        {'label': 'Time',
         'values': tips_df['time']},
        {'label': 'Size',
         'values': tips_df['size'],
         'categoryorder': 'category ascending'},
    ],
    # Map through plain strings so the colour array holds colour names rather
    # than a categorical.
    line=dict(color=cutqhisto.astype(str).map(colors)),
))

fig.update_layout(
    title='Variables leading to tip size'
)

fig.show()

# NOTE(review): the tips download and its tercile split are not used by the
# figure below; they are kept only so the names stay defined - consider removing.
tips_df = pd.read_csv('https://raw.githubusercontent.com/pandas-dev/pandas/main/pandas/tests/io/data/csv/tips.csv')
cutqhisto, cutqscheid = pd.qcut(tips_df['tip'], q=3, labels=['Low', 'Medium', 'High'], retbins=True)

# Parallel-categories view of the merged country table. The labels and title
# now describe the plotted columns instead of the tips example they were
# copied from.
# NOTE(review): 'Basic', 'Last Value' and 'Loss' print as continuous floats in
# the merged table, so each distinct value becomes its own category - consider
# binning them before plotting.
fig = go.Figure(data=go.Parcats(dimensions=[
        {'label': 'GDP per capita (binned)',
         # cutgdp holds pandas Interval categories; they are passed as strings
         # because plotly serialises figure data to JSON.
         'values': cutgdp.astype(str),
         'categoryorder': 'category descending'
        },
        {'label': 'Basic water availability',
         'values': merged_data['Basic'],
         'categoryorder': 'category ascending'},
        {'label': '% Renewable energy',
         'values': merged_data['Last Value']},
        {'label': 'Loss of natural vegetated land, % since 1992',
         'values': merged_data['Loss']},
        ]))

fig.update_layout(
       title='Water access, renewable energy and vegetation loss by GDP group'
)

fig.show()

import plotly.graph_objects as go

# Split countries into three equally populated GDP groups. The bin edges that
# retbins returned were discarded, so they are no longer requested.
gdp_levels = ['Low', 'Medium', 'High']
cutgdp = pd.qcut(merged_data['GDP'], q=3, labels=gdp_levels)

# Parallel-categories view of the merged country table. Every dimension now
# carries a label that matches its column; the previous 'Smoker'/'Day' labels
# and the title were left over from the tips example.
fig = go.Figure(data=go.Parcats(dimensions=[
        # 'category descending' orders category names alphabetically
        # (Medium, Low, High); an explicit array keeps High > Medium > Low.
        {'label': 'GDP per capita (tercile)',
         'values': cutgdp,
         'categoryorder': 'array',
         'categoryarray': gdp_levels[::-1]
        },
        {'label': 'Basic water availability',
         'values': merged_data['Basic'],
         'categoryorder': 'category ascending'},
        {'label': '% Renewable energy',
         'values': merged_data['Last Value']},
        {'label': 'Loss of natural vegetated land, % since 1992',
         'values': merged_data['Loss']},
    ]))

# Fixed 1000x1000 canvas instead of autosizing
fig.update_layout(
       title='Water access, renewable energy and vegetation loss by GDP group',
       autosize=False,
       width=1000,
       height=1000
)

fig.show()
