In [44]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import datetime as dt

In [45]:
# Open a MongoDB client connection on port 27017 (host is left at the client's default)
MONGO_PORT = 27017
mongo = MongoClient(port=MONGO_PORT)

In [46]:
# List the databases on the server to confirm that "coviddatadb" is present
database_names = mongo.list_database_names()
print(database_names)

['admin', 'autosaurus', 'classDB', 'config', 'coviddatadb', 'customerdb', 'fruits_db', 'gardenDB', 'local', 'test_class', 'travel_db', 'uk_food']


In [47]:
# Select the "coviddatadb" database and keep a handle to it
DB_NAME = 'coviddatadb'
db = mongo[DB_NAME]

In [48]:
# List the collections (the MongoDB analogue of tables) in the selected database
collection_names = db.list_collection_names()
print(collection_names)


['cleanedcoviddata']


In [49]:
# Keep a handle to the "cleanedcoviddata" collection for querying
COLLECTION_NAME = 'cleanedcoviddata'
covid_data = db[COLLECTION_NAME]

# Clean the Data

In [50]:
# Retrieve every document from the MongoDB collection.
# find() returns a single-use cursor: once it has been iterated it is exhausted,
# so building a DataFrame from it a second time would silently give an empty
# frame. Materialising the documents into a list keeps later cells re-runnable.
results = list(covid_data.find())

In [51]:
# Load the retrieved documents into a pandas DataFrame (one row per document)
df = pd.DataFrame(data=results)

In [52]:
# Report how many rows were loaded into the DataFrame
row_count = len(df)
print('rows in DataFrame:', row_count)

rows in DataFrame: 330696


In [53]:
# Preview the first rows of the DataFrame (last expression, so it is rendered as a table)
PREVIEW_ROWS = 10
df.head(PREVIEW_ROWS)

Unnamed: 0,_id,iso_code,continent,location,date,stringency_index,median_age,gdp_per_capita,handwashing_facilities,total_cases,total_cases_per_million,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,total_boosters_per_hundred,female_smokers,male_smokers
0,64d910440e69a590d76a008b,AFG,Asia,Afghanistan,03/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
1,64d910440e69a590d76a008c,AFG,Asia,Afghanistan,04/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
2,64d910440e69a590d76a008d,AFG,Asia,Afghanistan,05/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
3,64d910440e69a590d76a008e,AFG,Asia,Afghanistan,06/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
4,64d910440e69a590d76a008f,AFG,Asia,Afghanistan,07/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
5,64d910440e69a590d76a0090,AFG,Asia,Afghanistan,08/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
6,64d910440e69a590d76a0091,AFG,Asia,Afghanistan,09/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
7,64d910440e69a590d76a0092,AFG,Asia,Afghanistan,10/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
8,64d910440e69a590d76a0093,AFG,Asia,Afghanistan,11/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,
9,64d910440e69a590d76a0094,AFG,Asia,Afghanistan,12/01/2020,0.0,18.6,1803.987,37.746,,,,,,,,,


In [41]:
# Summarise the DataFrame: per-column non-null counts and dtypes.
# In the output below, "date" is still an object (string) column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330696 entries, 0 to 330695
Data columns (total 18 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   _id                         330696 non-null  object 
 1   iso_code                    330696 non-null  object 
 2   continent                   315000 non-null  object 
 3   location                    330696 non-null  object 
 4   date                        330696 non-null  object 
 5   stringency_index            197651 non-null  float64
 6   median_age                  261029 non-null  float64
 7   gdp_per_capita              255826 non-null  float64
 8   handwashing_facilities      125570 non-null  float64
 9   total_cases                 293300 non-null  float64
 10  total_cases_per_million     293300 non-null  float64
 11  total_vaccinations          77350 non-null   float64
 12  people_vaccinated           74071 non-null   float64
 13  people_fully_v

In [56]:
# Parse the day/month/year strings in "date" into datetime values so the
# column can be used for time-based analysis
parsed_dates = pd.to_datetime(df['date'], format='%d/%m/%Y')
df['date'] = parsed_dates

In [57]:
# Re-run the summary to confirm that the "date" column is now datetime64[ns]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330696 entries, 0 to 330695
Data columns (total 18 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   _id                         330696 non-null  object        
 1   iso_code                    330696 non-null  object        
 2   continent                   315000 non-null  object        
 3   location                    330696 non-null  object        
 4   date                        330696 non-null  datetime64[ns]
 5   stringency_index            197651 non-null  float64       
 6   median_age                  261029 non-null  float64       
 7   gdp_per_capita              255826 non-null  float64       
 8   handwashing_facilities      125570 non-null  float64       
 9   total_cases                 293300 non-null  float64       
 10  total_cases_per_million     293300 non-null  float64       
 11  total_vaccinations          77350 non-n

In [58]:
# Snapshot the cleaned frame under its own name. A plain `clean_df = df` would
# only create a second reference to the same object, so any later in-place change
# to either name would silently affect both; copy() keeps them independent.
clean_df = df.copy()

# Write the cleaned data to 'clean_df-new1.csv'. The path is relative, so the
# file is created in the current working directory; index=False omits the row index.
clean_df.to_csv('clean_df-new1.csv',index=False)