In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


This code mounts Google Drive to the Colab environment, allowing access to files stored in Drive.

In [2]:
import numpy as np
import pandas as pd

These lines import the NumPy and pandas libraries. NumPy is used for numerical operations, such as generating random numbers or performing calculations. Pandas is used for data manipulation and analysis, especially when working with tabular data (like CSV files).

In [3]:
df=pd.read_csv('gdrive/My Drive/Colab Notebooks/city_level_data_0_0.csv')

This line reads the CSV file named 'city_level_data_0_0.csv' from the user's Google Drive. The file is loaded into a pandas DataFrame called df, which will be used for data analysis and manipulation.

In [4]:
print(df.columns.tolist())

['iso3c', 'region_id', 'country_name', 'income_id', 'city_name', 'additional_data_annual_budget_for_waste_management_year', 'additional_data_annual_solid_waste_budget_year', 'additional_data_annual_swm_budget_2017_year', 'additional_data_annual_swm_budget_year', 'additional_data_annual_waste_budget_year', 'additional_data_collection_ton', 'additional_data_number_of_scavengers_on_dumpsites_number', 'additional_data_other_user_fees_na', 'additional_data_swm_contract_arrangement_1_year_contract_period', 'additional_data_swm_contract_arrangement_3_year_contract_period', 'additional_data_total_annual_costs_to_collect_and_dispose_of_city_s_waste_year', 'additional_data_total_swm_expenditures_year', 'additional_data_total_waste_management_budget_year', 'communication_list_of_channels_through_which_the_city_collects_feedback_from_it_residents_on_issues_related_to_solid_waste_services_na', 'communication_summary_of_key_solid_waste_information_made_periodically_available_to_the_public_na', 'comp

This line prints out a list of all the column names in the DataFrame.

In [5]:
print(df['country_name'].unique())

['Afghanistan' 'Angola' 'Albania' 'United Arab Emirates' 'Argentina'
 'Armenia' 'American Samoa' 'Australia' 'Austria' 'Azerbaijan' 'Burundi'
 'Belgium' 'Benin' 'Burkina Faso' 'Bangladesh' 'Bulgaria' 'Bahrain'
 'Bosnia and Herzegovina' 'Belarus' 'Belize' 'Bolivia' 'Brazil' 'Bhutan'
 'Botswana' 'Canada' 'Switzerland' 'Chile' 'China' 'Côte d’Ivoire'
 'Cameroon' 'Congo, Dem. Rep.' 'Congo, Rep.' 'Colombia' 'Comoros'
 'Costa Rica' 'Cuba' 'Cyprus' 'Czech Republic' 'Germany' 'Djibouti'
 'Denmark' 'Dominican Republic' 'Algeria' 'Ecuador' 'Egypt, Arab Rep.'
 'Spain' 'Estonia' 'Ethiopia' 'Finland' 'Fiji' 'France'
 'Micronesia, Fed. Sts.' 'Gabon' 'United Kingdom' 'Georgia' 'Ghana'
 'Guinea' 'Gambia, The' 'Equatorial Guinea' 'Greece' 'Guatemala'
 'Honduras' 'Croatia' 'Haiti' 'Hungary' 'Indonesia' 'Isle of Man' 'India'
 'Ireland' 'Iran, Islamic Rep.' 'Iraq' 'Israel' 'Italy' 'Jordan' 'Japan'
 'Kazakhstan' 'Kenya' 'Kyrgyz Republic' 'Cambodia' 'Kiribati'
 'Korea, Rep.' 'Kuwait' 'Lao PDR' 'Lebanon' 'Li

This line displays all the unique country names found in the 'country_name' column of the DataFrame.

In [6]:
df["total_msw_total_msw_generated_tons_year"] = pd.to_numeric(
    df["total_msw_total_msw_generated_tons_year"],
    errors="coerce"
)


This line converts the values in the 'total_msw_total_msw_generated_tons_year' column to numeric data type. If any value cannot be converted (e.g., it's text or missing), it will be replaced with NaN (Not a Number). This ensures that the column is ready for mathematical operations like summing or averaging.

In [7]:
waste_by_country = df.groupby("country_name")["total_msw_total_msw_generated_tons_year"]\
                     .sum()\
                     .sort_values(ascending=False)

print(waste_by_country.head(10))


country_name
India                 2.075418e+07
Brazil                8.903979e+06
Russian Federation    7.989254e+06
China                 7.903000e+06
Saudi Arabia          6.580000e+06
Mexico                5.784915e+06
Egypt, Arab Rep.      5.475000e+06
Pakistan              5.280906e+06
Vietnam               4.909250e+06
South Africa          4.540491e+06
Name: total_msw_total_msw_generated_tons_year, dtype: float64


In [21]:
all_city_waste = df[["city_name", "total_msw_total_msw_generated_tons_year"]].copy()


all_city_waste.dropna(subset=["total_msw_total_msw_generated_tons_year"], inplace=True)


all_city_waste = all_city_waste.sort_values(by="total_msw_total_msw_generated_tons_year", ascending=False)


This block groups the dataset by country name and calculates the total waste generated per country. It sums the 'total_msw_total_msw_generated_tons_year' values for each country, sorts them in descending order (from highest to lowest), and prints the top 10 countries.

In [9]:
waste_by_country = df.groupby("country_name")["total_msw_total_msw_generated_tons_year"].sum().sort_values(ascending=False)
print(waste_by_country)


country_name
India                 20754176.75
Brazil                 8903979.00
Russian Federation     7989254.00
China                  7903000.00
Saudi Arabia           6580000.00
                         ...     
Isle of Man                  0.00
Finland                      0.00
Samoa                        0.00
Tuvalu                       0.00
West Bank and Gaza           0.00
Name: total_msw_total_msw_generated_tons_year, Length: 164, dtype: float64


This code groups the dataset by country name and calculates the total waste generated  (in tons per year) for each country by summing the values. It then sorts the countries in descending order to show which countries generate ton

In [10]:
df["diversion_rate"] = df["waste_treatment_recycling_percent"] + df["waste_treatment_compost_percent"]


This line creates a new column in the dataset called 'diversion_rate'.It adds together the percentage of waste treated through recycling and composting for each city. The diversion rate is a common metric used to show how much waste is diverted from landfills.

In [17]:
diversion_by_country = df.groupby("country_name")["diversion_rate"].mean().reset_index()

diversion_by_country = diversion_by_country.sort_values(by="diversion_rate", ascending=False)

Create a country-level subset of average diversion rates

In [11]:
top_diversion = df[["city_name", "diversion_rate"]].sort_values(by="diversion_rate", ascending=False).head(10)
low_diversion = df[["city_name", "diversion_rate"]].sort_values(by="diversion_rate", ascending=True).head(10)


These lines identify the cities with the highest and lowest diversion rates. Selects the top 10 cities with the highest diversion rates (most recycling + composting). Selects the bottom 10 cities with the lowest diversion rates (least recycling + composting).

In [12]:
df["total_msw_total_msw_generated_tons_year"] = pd.to_numeric(df["total_msw_total_msw_generated_tons_year"], errors="coerce")

top_cities_waste = df[["city_name", "total_msw_total_msw_generated_tons_year"]].sort_values(by="total_msw_total_msw_generated_tons_year", ascending=False).head(10)


This block ensures the waste data is numeric and identifies cities generating the most waste. Converts the 'total_msw_total_msw_generated_tons_year' column to numeric,coercing errors (like text) to NaN to avoid calculation issues. Selects the top 10 cities that generate the most municipal solid waste (in tons per year).

In [24]:
from google.colab import files
files.download("all_city_waste.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
all_city_waste.to_csv("all_city_waste.csv", index=False)

Download csv