# Collect data: Covid-19 cases 
*****

### The Data 
I will use the data obtained from Wikipedia.<br>
https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_data/United_States_medical_cases


# Collect data from Wikipedia

In [1]:
import pandas as pd
import datetime

# Scraping the data
url = 'https://en.wikipedia.org/wiki/Template:COVID-19_pandemic_data/United_States_medical_cases'
dfs = pd.read_html(url,header=1, skiprows=0, parse_dates=True)

In [2]:
# Look at the columns
dfs[0].columns

Index(['Date', 'AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT', 'NM', 'NV', 'OR',
       'UT', 'WA', 'WY', 'IA', 'IL', 'IN', 'KS', 'MI', 'MN', 'MO', 'ND', 'NE',
       'OH', 'OK', 'SD', 'WI', 'AL', 'AR', 'FL', 'GA', 'KY', 'LA', 'MS', 'NC',
       'SC', 'TN', 'TX', 'VA', 'WV', 'CT', 'DC', 'DE', 'MA', 'MD', 'ME', 'NH',
       'NJ', 'NY', 'PA', 'RI', 'VT', 'GU', 'MP', 'PR', 'VI', 'Date.1', 'Daily',
       'Total', 'Daily.1', 'Total.1', 'Daily.2', 'Total.2', 'Total.3'],
      dtype='object')

In [3]:
# Pick up columns that you want to use

df = dfs[0][['Date', 'Daily', 'Daily.1']].dropna()
df

Unnamed: 0,Date,Daily
0,21-Jan-20,1
1,24-Jan-20,1
2,25-Jan-20,1
3,26-Jan-20,2
4,30-Jan-20,1
...,...,...
357,24-Jan-21,130885
358,25-Jan-21,130142
359,26-Jan-21,129487
361,Total,Confirmed


In [4]:
# Rename the columns in detail

df.columns = ['Date', 'Daily', 'Daily.1']

## Clean the data by removing noises

In [5]:
# Convert datatype to date and int 

def read_type(date,case):
    try:
        date = datetime.datetime.strptime(date, '%d-%b-%y')
        return date, int(case)
    except:
        return None

df = df.apply(lambda x: read_type(x['Date'], x['Daily Cases']), axis=1, result_type='broadcast').dropna()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 345 entries, 0 to 359
Data columns (total 2 columns):
Date           345 non-null datetime64[ns]
Daily Cases    345 non-null object
dtypes: datetime64[ns](1), object(1)
memory usage: 8.1+ KB


## Save the dataset in a csv file

In [7]:
path = 'Data/'

df.to_csv(path+'covid_data.csv',index=False)

In [8]:
df

Unnamed: 0,Date,Daily Cases
0,2020-01-21,1
1,2020-01-24,1
2,2020-01-25,1
3,2020-01-26,2
4,2020-01-30,1
...,...,...
355,2021-01-22,175347
356,2021-01-23,159728
357,2021-01-24,130885
358,2021-01-25,130142
