In [36]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd

In [37]:
# Connect to the MongoDB server (no host is passed, so the driver default applies)
MONGO_PORT = 27017
mongo = MongoClient(port=MONGO_PORT)

In [38]:
# List the databases on the server to check that "admin" is among them
database_names = mongo.list_database_names()
print(database_names)

['admin', 'config', 'local']


In [39]:
# Keep a handle to the database that is queried in the cells below
DB_NAME = 'admin'
db = mongo[DB_NAME]

In [40]:
# List the collections (tables) available in the selected database
collection_names = db.list_collection_names()
print(collection_names)


['covid-data', 'system.version', 'covid', 'data-covid']


In [41]:
# Bind the 'covid-data' collection to a variable for the queries below
COLLECTION_NAME = 'covid-data'
covid_data = db[COLLECTION_NAME]

# Clean the Data

In [42]:
# Retrieve every document from the MongoDB collection.
# find() hands back a single-pass cursor; materialise it into a list so that
# any cell consuming `results` gives the same answer when it is re-run,
# instead of iterating an already-exhausted cursor and getting no rows.
results = list(covid_data.find())

In [43]:
# Load the retrieved documents into a pandas DataFrame
df = pd.DataFrame(data=results)

In [44]:
# Report how many rows were loaded into the DataFrame
row_count = df.shape[0]
print('rows in DataFrame:', row_count)

rows in DataFrame: 3546


In [45]:
# Preview the leading rows of the DataFrame
PREVIEW_ROWS = 10
df.head(n=PREVIEW_ROWS)

Unnamed: 0,_id,iso_code,continent,location,date,new_deaths,total_cases_per_million,median_age,aged_65_older,gdp_per_capita,...,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,extreme_poverty,female_smokers,male_smokers,hosp_patients_per_million,excess_mortality_cumulative_absolute,weekly_icu_admissions_per_million
0,64d29d73c7e1b733c9309436,AFG,Asia,Afghanistan,01/04/2020,0.0,4.036,18.6,2.581,1803.987,...,,,,,,,,,,
1,64d29d73c7e1b733c9309437,AFG,Asia,Afghanistan,01/07/2020,2.0,762.459,18.6,2.581,1803.987,...,,,,,,,,,,
2,64d29d73c7e1b733c9309438,AFG,Asia,Afghanistan,01/10/2020,0.0,954.757,18.6,2.581,1803.987,...,,,,,,,,,,
3,64d29d73c7e1b733c9309439,AFG,Asia,Afghanistan,01/01/2021,12.0,1276.795,18.6,2.581,1803.987,...,,,,,,,,,,
4,64d29d73c7e1b733c930943a,AFG,Asia,Afghanistan,01/04/2021,5.0,1374.148,18.6,2.581,1803.987,...,,,,,,,,,,
5,64d29d73c7e1b733c930943b,AFG,Asia,Afghanistan,01/07/2021,91.0,2922.917,18.6,2.581,1803.987,...,,,,,,,,,,
6,64d29d73c7e1b733c930943c,AFG,Asia,Afghanistan,01/10/2021,2.0,3770.329,18.6,2.581,1803.987,...,,,,,,,,,,
7,64d29d73c7e1b733c930943d,AFG,Asia,Afghanistan,01/01/2022,0.0,3843.027,18.6,2.581,1803.987,...,,,,,,,,,,
8,64d29d73c7e1b733c930943e,AFG,Asia,Afghanistan,01/04/2022,5.0,4321.719,18.6,2.581,1803.987,...,,,,,,,,,,
9,64d29d73c7e1b733c930943f,AFG,Asia,Afghanistan,01/07/2022,1.0,4437.964,18.6,2.581,1803.987,...,,,,,,,,,,


In [46]:
# Print the frame summary (column names, non-null counts, dtypes) to spot
# columns whose type needs converting before analysis
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3546 entries, 0 to 3545
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   _id                                   3546 non-null   object 
 1   iso_code                              3546 non-null   object 
 2   continent                             3378 non-null   object 
 3   location                              3546 non-null   object 
 4   date                                  3546 non-null   object 
 5   new_deaths                            3444 non-null   float64
 6   total_cases_per_million               3281 non-null   float64
 7   median_age                            2797 non-null   float64
 8   aged_65_older                         2699 non-null   float64
 9   gdp_per_capita                        2741 non-null   float64
 10  handwashing_facilities                1345 non-null   float64
 11  new_tests_smoothe

In [47]:
# Change data types for data analysis.
# The stored dates are day-first strings (DD/MM/YYYY): the preview rows step
# 01/04 -> 01/07 -> 01/10 -> 01/01, i.e. quarterly (1 Apr, 1 Jul, 1 Oct, 1 Jan).
# Without an explicit format pandas reads them month-first, silently turning
# every row into a January date, so the format is pinned here. A value that
# does not match DD/MM/YYYY now raises instead of being mis-parsed.
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

In [48]:
# Re-print the frame summary to confirm the conversion took effect:
# the `date` column should now show a datetime64 dtype instead of object
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3546 entries, 0 to 3545
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   _id                                   3546 non-null   object        
 1   iso_code                              3546 non-null   object        
 2   continent                             3378 non-null   object        
 3   location                              3546 non-null   object        
 4   date                                  3546 non-null   datetime64[ns]
 5   new_deaths                            3444 non-null   float64       
 6   total_cases_per_million               3281 non-null   float64       
 7   median_age                            2797 non-null   float64       
 8   aged_65_older                         2699 non-null   float64       
 9   gdp_per_capita                        2741 non-null   float64       
 10  

In [50]:
# Persist the cleaned frame as a CSV file, leaving out the row index.
# NOTE(review): `clean_df` is the same object as `df`, not a copy.
# NOTE(review): the path has no directory part, so the file is written to the
# current working directory rather than a Resources folder - confirm intent.
OUTPUT_CSV = 'clean_df123.csv'
clean_df = df
clean_df.to_csv(OUTPUT_CSV, index=False)