# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [2]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [5]:
# Relative location of the raw stock CSV (resolved against the notebook's working directory).
csv_path = Path("../../Resources/stock_data.csv")

# Parse the CSV into a DataFrame; every later cell works from `csv_data`.
csv_data = pd.read_csv(filepath_or_buffer=csv_path)

### Identify the number of rows and columns (shape) in the DataFrame.

In [9]:
# `shape` is a (row count, column count) tuple for the loaded DataFrame.
csv_data.shape

(504, 14)

### Generate a sample of the data to visually confirm that it has been loaded correctly.

In [10]:
# Pull five rows at random as a quick visual sanity check of the load.
# No seed is set, so a different set of rows appears on each run.
csv_data.sample(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
362,PM,Philip Morris International,Consumer Staples,100.39,22.36,4.328479,4.48,123.55,96.66,153580700000.0,11802000000.0,2.757468,1318.7,http://www.sec.gov/cgi-bin/browse-edgar?action...
193,FISV,Fiserv Inc,Information Technology,133.05,27.32,0.0,4.14,144.2,106.2,26918950000.0,1911000000.0,6.518713,11.54,http://www.sec.gov/cgi-bin/browse-edgar?action...
161,ETN,Eaton Corporation,Industrials,79.41,17.08,2.940456,6.68,89.85,69.45,35961770000.0,4253000000.0,1.816174,2.17,http://www.sec.gov/cgi-bin/browse-edgar?action...
357,PEP,PepsiCo Inc.,Consumer Staples,110.15,21.51,2.837004,4.36,122.51,104.77,161413300000.0,12843000000.0,3.670506,,http://www.sec.gov/cgi-bin/browse-edgar?action...
480,WBA,Walgreens Boots Alliance,Consumer Staples,68.22,13.38,2.236824,3.78,88.0,63.82,70862540000.0,7083000000.0,0.590439,3.06,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [12]:
# Number of non-null entries in each column; columns that fall short of the
# total row count contain missing values.
csv_data.count(axis="index")

symbol                504
name                  502
sector                501
price                 500
price_per_earnings    497
dividend_yield        499
earnings_per_share    498
52_week_low           500
52_week_high          500
market_cap            500
ebitda                492
price_per_sales       500
price_per_book        492
sec_filings           500
dtype: int64

### Identify null records

In [17]:
# Count the missing values in each column (`isna` is the same check as `isnull`).
# Dividing the result by csv_data.shape[0] would give the fraction of rows missing instead.
csv_data.isna().sum()

symbol                 0
name                   2
sector                 3
price                  4
price_per_earnings     7
dividend_yield         5
earnings_per_share     6
52_week_low            4
52_week_high           4
market_cap             4
ebitda                12
price_per_sales        4
price_per_book        12
sec_filings            4
dtype: int64

### Drop Null Records

In [19]:
# Discard every row that has a missing value in at least one column.
# `axis="index"` and `how="any"` are the defaults, spelled out here for clarity.
csv_data = csv_data.dropna(axis="index", how="any")

### Validate nulls have been dropped

In [21]:
# Re-count missing values per column; every count should now be zero.
csv_data.isna().sum()

symbol                0
name                  0
sector                0
price                 0
price_per_earnings    0
dividend_yield        0
earnings_per_share    0
52_week_low           0
52_week_high          0
market_cap            0
ebitda                0
price_per_sales       0
price_per_book        0
sec_filings           0
dtype: int64

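### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

*Note: this step has no code cell. Because every row containing a null was already dropped above, no null `ebitda` values remain at this point.*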

### Drop Duplicates

In [23]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
# The original index labels are preserved, so gaps in the index are expected.
csv_data = csv_data.drop_duplicates(keep="first")

# Preview the first five rows of the de-duplicated data.
csv_data.head(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.49,138721100000.0,9048000000.0,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121000000.0,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.49956,3.29,125.86,60.05,181386300000.0,10310000000.0,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
5,AYI,Acuity Brands Inc,Industrials,108.48,18.22,0.351185,7.43,225.36,142.0,6242378000.0,587800000.0,1.795347,3.55,http://www.sec.gov/cgi-bin/browse-edgar?action...
6,ADBE,Adobe Systems Inc,Information Technology,185.16,52.31,0.0,3.39,204.45,114.451,94550210000.0,2538040000.0,13.092818,11.06,http://www.sec.gov/cgi-bin/browse-edgar?action...
