In [1]:
import pandas as pd
import numpy as np

# Data Cleaning & Manipulation

In [4]:
wine = pd.read_csv('../data/wine_reviews/winemag-data_first150k.csv', index_col=0)
energy = pd.read_csv('../data/energy/PJM_Load_hourly.csv', parse_dates=True, index_col=0)

**Exercise:** Make a new column in the energy dataset, showing the energy in time T-1 for each time T. Hint: there is a built-in method in Pandas that can help with this, no complicated functions needed :)

In [5]:
energy.loc[:, 'energy_t-1'] = energy.loc[:, 'PJM_Load_MW'].shift()

In [6]:
energy.head()

Unnamed: 0_level_0,PJM_Load_MW,energy_t-1
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
1998-12-31 01:00:00,29309.0,
1998-12-31 02:00:00,28236.0,29309.0
1998-12-31 03:00:00,27692.0,28236.0
1998-12-31 04:00:00,27596.0,27692.0
1998-12-31 05:00:00,27888.0,27596.0


**Exercise:** Re-read the CSV for the wine data, and write a function to take care of all the nan values. Bonus points if it includes a test.

In [8]:
wine1 = pd.read_csv('../data/wine_reviews/winemag-data_first150k.csv', index_col=0)

def fix_null_values(wine_df, cols_to_drop, cols_to_fill):
    """
    Function to remove null values in the wine dataframe.
    Args:
        wine_df (DataFrame): DataFrame from which to remove the null values.
        cols_to_drop (list): List of column names to drop rows from
        cols_to_fill (list): List of columns names to fill with "unknown"
    Returns:
        wine_df (DataFrame): DataFrame with null values removed
    """
    for col in cols_to_drop:
        indicies_to_drop = wine_df.loc[wine_df[col].isnull()]
        wine_df = wine_df[~wine_df.index.isin(indicies_to_drop.index)]
        
    for col in cols_to_fill:
        wine_df.loc[:, col] = "unknown"
        
    # checking that all null values have been removed
    if wine_df.isnull().any().any() == False:
        return wine_df
    
    # if there are still null values, returning column name to user
    else:
        all_cols_list = list(wine_df.isnull().any())
        list_cols_with_nulls = [col for i, col in enumerate(all_cols_list.index) if all_cols_list[i] is True]
        print(f"Null values still exist! Columns {list_cols_with_nulls}")
        return
        
to_drop = ["price", "country"]
to_fill = ['description', 'designation', 'points', 'province', 'region_1', 'region_2', 'variety', 'winery']

# calling funciton
fix_null_values(wine1, to_drop, to_fill)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
0,US,unknown,unknown,unknown,235.0,unknown,unknown,unknown,unknown,unknown
1,Spain,unknown,unknown,unknown,110.0,unknown,unknown,unknown,unknown,unknown
2,US,unknown,unknown,unknown,90.0,unknown,unknown,unknown,unknown,unknown
3,US,unknown,unknown,unknown,65.0,unknown,unknown,unknown,unknown,unknown
4,France,unknown,unknown,unknown,66.0,unknown,unknown,unknown,unknown,unknown
...,...,...,...,...,...,...,...,...,...,...
150925,Italy,unknown,unknown,unknown,20.0,unknown,unknown,unknown,unknown,unknown
150926,France,unknown,unknown,unknown,27.0,unknown,unknown,unknown,unknown,unknown
150927,Italy,unknown,unknown,unknown,20.0,unknown,unknown,unknown,unknown,unknown
150928,France,unknown,unknown,unknown,52.0,unknown,unknown,unknown,unknown,unknown
