In [1]:
# Import dependencies
from pymongo import MongoClient
from pprint import pprint
import pandas as pd

In [2]:
# Connect to the MongoDB server on localhost, port 27017
mongo = MongoClient(host='localhost', port=27017)

In [3]:
# List the databases on the server to check that "admin" is among them
database_names = mongo.list_database_names()
print(database_names)

['admin', 'config', 'local']


In [4]:
# Bind the "admin" database to a short variable name
db = mongo.get_database('admin')

In [5]:
# List the collections (MongoDB's analogue of tables) held in the database
collection_names = db.list_collection_names()
print(collection_names)


['covid-data', 'system.version', 'covid', 'data-covid']


In [6]:
# Bind the "covid-data" collection to a variable
covid_data = db.get_collection('covid-data')

# Clean the Data

In [7]:
# Retrieve every document from the MongoDB collection.
# find() returns a single-pass cursor; materialising it into a list means
# the DataFrame cell that consumes `results` can be re-run without silently
# producing an empty frame from an already-exhausted cursor.
results = list(covid_data.find())

In [8]:
# Build a pandas DataFrame from the retrieved documents
df = pd.DataFrame(data=results)

In [9]:
# Report how many rows the DataFrame holds
print('rows in DataFrame:', df.shape[0])

rows in DataFrame: 330696


In [10]:
# Preview the first 10 rows of the DataFrame
df.head(n=10)

Unnamed: 0,_id,iso_code,continent,location,date,stringency_index,median_age,gdp_per_capita,handwashing_facilities,total_cases,total_cases_per_million,total_deaths,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,total_boosters_per_hundred,female_smokers,male_smokers
0,64d559ae041874b85a853f70,AFG,Asia,Afghanistan,2020-01-03,0.0,18.6,1803.987,37.746,,,,,,,,,,
1,64d559ae041874b85a853f71,AFG,Asia,Afghanistan,2020-01-04,0.0,18.6,1803.987,37.746,,,,,,,,,,
2,64d559ae041874b85a853f72,AFG,Asia,Afghanistan,2020-01-05,0.0,18.6,1803.987,37.746,,,,,,,,,,
3,64d559ae041874b85a853f73,AFG,Asia,Afghanistan,2020-01-06,0.0,18.6,1803.987,37.746,,,,,,,,,,
4,64d559ae041874b85a853f74,AFG,Asia,Afghanistan,2020-01-07,0.0,18.6,1803.987,37.746,,,,,,,,,,
5,64d559ae041874b85a853f75,AFG,Asia,Afghanistan,2020-01-08,0.0,18.6,1803.987,37.746,,,,,,,,,,
6,64d559ae041874b85a853f76,AFG,Asia,Afghanistan,2020-01-09,0.0,18.6,1803.987,37.746,,,,,,,,,,
7,64d559ae041874b85a853f77,AFG,Asia,Afghanistan,2020-01-10,0.0,18.6,1803.987,37.746,,,,,,,,,,
8,64d559ae041874b85a853f78,AFG,Asia,Afghanistan,2020-01-11,0.0,18.6,1803.987,37.746,,,,,,,,,,
9,64d559ae041874b85a853f79,AFG,Asia,Afghanistan,2020-01-12,0.0,18.6,1803.987,37.746,,,,,,,,,,


In [11]:
# Examine the data type and non-null count of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330696 entries, 0 to 330695
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   _id                         330696 non-null  object 
 1   iso_code                    330696 non-null  object 
 2   continent                   315000 non-null  object 
 3   location                    330696 non-null  object 
 4   date                        330696 non-null  object 
 5   stringency_index            197651 non-null  float64
 6   median_age                  261029 non-null  float64
 7   gdp_per_capita              255826 non-null  float64
 8   handwashing_facilities      125570 non-null  float64
 9   total_cases                 293300 non-null  float64
 10  total_cases_per_million     293300 non-null  float64
 11  total_deaths                272174 non-null  float64
 12  total_vaccinations          77350 non-null   float64
 13  people_vaccina

In [12]:
# Convert the object-typed 'date' column to datetime values for analysis
raw_dates = df['date']
df['date'] = pd.to_datetime(raw_dates)

In [13]:
# Re-run info() to confirm that 'date' is now a datetime column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330696 entries, 0 to 330695
Data columns (total 19 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   _id                         330696 non-null  object        
 1   iso_code                    330696 non-null  object        
 2   continent                   315000 non-null  object        
 3   location                    330696 non-null  object        
 4   date                        330696 non-null  datetime64[ns]
 5   stringency_index            197651 non-null  float64       
 6   median_age                  261029 non-null  float64       
 7   gdp_per_capita              255826 non-null  float64       
 8   handwashing_facilities      125570 non-null  float64       
 9   total_cases                 293300 non-null  float64       
 10  total_cases_per_million     293300 non-null  float64       
 11  total_deaths                272174 non-

In [14]:
# Drop every row that has a missing value (NaN/NaT) in any column.
# dropna() returns a new frame, which is rebound to df here rather than
# mutating the existing frame with inplace=True.
# NOTE(review): requiring every column to be populated is very strict -- in
# the recorded run it reduced the data from 330,696 rows to 4,190; confirm
# this is intended rather than dropping on a subset of columns.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4190 entries, 4612 to 330398
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   _id                         4190 non-null   object        
 1   iso_code                    4190 non-null   object        
 2   continent                   4190 non-null   object        
 3   location                    4190 non-null   object        
 4   date                        4190 non-null   datetime64[ns]
 5   stringency_index            4190 non-null   float64       
 6   median_age                  4190 non-null   float64       
 7   gdp_per_capita              4190 non-null   float64       
 8   handwashing_facilities      4190 non-null   float64       
 9   total_cases                 4190 non-null   float64       
 10  total_cases_per_million     4190 non-null   float64       
 11  total_deaths                4190 non-null   float64

In [15]:
# Preview the first five rows remaining after the NaN rows were dropped
df.head(n=5)

Unnamed: 0,_id,iso_code,continent,location,date,stringency_index,median_age,gdp_per_capita,handwashing_facilities,total_cases,total_cases_per_million,total_deaths,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,total_boosters_per_hundred,female_smokers,male_smokers
4612,64d559af041874b85a855174,DZA,Africa,Algeria,2021-11-21,39.81,29.1,13913.839,83.741,208839.0,4650.868,6017.0,12008974.0,6697263.0,5298544.0,13167.0,0.03,0.7,30.4
4616,64d559af041874b85a855178,DZA,Africa,Algeria,2021-11-25,39.81,29.1,13913.839,83.741,209463.0,4664.765,6035.0,12076870.0,6716299.0,5340231.0,20340.0,0.05,0.7,30.4
4620,64d559af041874b85a85517c,DZA,Africa,Algeria,2021-11-29,50.0,29.1,13913.839,83.741,210152.0,4680.109,6058.0,12145830.0,6740064.0,5380385.0,25381.0,0.06,0.7,30.4
4720,64d559af041874b85a8551e0,DZA,Africa,Algeria,2022-03-09,33.59,29.1,13913.839,83.741,265323.0,5908.773,6858.0,13704895.0,7461932.0,6110712.0,490676.0,1.09,0.7,30.4
4766,64d559af041874b85a85520e,DZA,Africa,Algeria,2022-04-24,33.44,29.1,13913.839,83.741,265761.0,5918.528,6874.0,13772044.0,7840131.0,6481186.0,514063.0,1.14,0.7,30.4


In [82]:
# Export the cleaned frame as CSV, omitting the row index.
# clean_df is another name for df (no copy is made); the file path has no
# directory component, so it is written to the current working directory.
clean_df = df
clean_df.to_csv(path_or_buf='clean_df-new.csv', index=False)