In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def load_and_process(filename):
    """Load a product CSV and return a cleaned frame with price outliers removed.

    The pipeline runs in three stages:

    1. Drop the ``product_link`` column, treat the literal string
       ``"Not available"`` as missing, and drop every row that has any
       missing value.
    2. Add ``category_type`` (the text before the first ``" > "`` in
       ``category``) and replace the strings ``"1 star"`` .. ``"5 star"``
       with the integers 1..5. The replacement is applied to the whole
       frame, not to one specific column.
    3. Keep only rows whose ``discounted_price`` lies within 1.5 * IQR of
       the first and third quartiles (bounds inclusive).

    Parameters
    ----------
    filename : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows; the original index labels of surviving rows are kept.

    Raises
    ------
    KeyError
        If ``product_link``, ``category`` or ``discounted_price`` is not a
        column of the file.

    Notes
    -----
    NOTE(review): the quantile-based filter presumes ``discounted_price`` is
    already numeric after loading -- confirm against the actual data file.
    """
    df = pd.read_csv(filename)

    # Method Chain 1: Remove columns not being used, deal with incorrect data, and deal with missing data
    # `np.nan` is used directly: the `pd.np` alias was removed in pandas 2.0.
    df = (
        df.drop(columns=["product_link"])
        .replace("Not available", np.nan)
        .dropna()
    )

    # Method Chain 2: Create new columns
    # The callable form of assign reads `category` from the frame being chained.
    df = (
        df.assign(
            category_type=lambda frame: frame["category"].apply(
                lambda x: x.split(" > ")[0]
            )
        )
        .replace({"5 star": 5, "4 star": 4, "3 star": 3, "2 star": 2, "1 star": 1})
    )

    # Method Chain 3: Deal with outliers (inclusive 1.5 * IQR fence on discounted_price)
    q1 = df["discounted_price"].quantile(0.25)
    q3 = df["discounted_price"].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df = df[(df["discounted_price"] >= lower_bound) & (df["discounted_price"] <= upper_bound)]

    return df