#### Import Libraries

In [1]:
import pandas as pd


#### Load Dataset

In [2]:
# Raw string: the Windows path contains "\d" and "\w", which are not valid
# escape sequences. A normal string only works by accident and triggers an
# invalid-escape warning on recent Python versions; the raw string has the
# exact same runtime value.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(r"C:\data-analysis-basics\week2\winemag-data-130k-v2.csv.zip")


#### Shape Before Cleaning

In [3]:
# Record the (rows, columns) of the raw frame so it can be compared with the
# shape printed at the end of the cleaning steps.
print("Shape before cleaning:", df.shape)


Shape before cleaning: (129971, 14)


#### Initial Inspection

In [4]:
# Print each column's dtype and non-null count to spot columns with gaps.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


#### Detect Missing Values

In [5]:
# Number of missing values in each column.
df.isna().sum()


Unnamed: 0                   0
country                     63
description                  0
designation              37465
points                       0
price                     8996
province                    63
region_1                 21247
region_2                 79460
taster_name              26244
taster_twitter_handle    31213
title                        0
variety                      1
winery                       0
dtype: int64

#### Drop Junk / High-Missing Columns

In [6]:
# Drop the taster's Twitter handle column.
# NOTE(review): only this column is removed. In the missing-value counts above,
# region_2 (79460) and designation (37465) have more gaps than
# taster_twitter_handle (31213) but are kept — so the reason for dropping it is
# presumably low modelling value rather than missingness; confirm.
df = df.drop(columns=["taster_twitter_handle"])


#### Handle Missing Numeric Values

In [7]:
# Fill missing prices with the column median, which is less affected by a few
# very expensive bottles than the mean would be.
df["price"] = df["price"].fillna(df["price"].median())


#### Handle Missing Categorical Values

In [8]:
# Fill missing country values with the most frequent country (first mode).
# NOTE(review): province is missing in the same number of rows (63) and is not
# filled here, so imputed rows may pair the modal country with an empty
# province — confirm this is acceptable.
df["country"] = df["country"].fillna(df["country"].mode()[0])


#### Remaining Missing Values Check

In [9]:
# Re-count missing values per column to confirm price and country are now complete.
df.isna().sum()


Unnamed: 0         0
country            0
description        0
designation    37465
points             0
price              0
province          63
region_1       21247
region_2       79460
taster_name    26244
title              0
variety            1
winery             0
dtype: int64

#### Rename Columns

In [10]:
# Shorten "taster_name" to "taster". The previous "designation" -> "designation"
# entry mapped the column to itself and has been removed as a no-op.
df = df.rename(columns={"taster_name": "taster"})


#### Data Type Fixing

In [11]:
# Show the dtype of every column after the rename.
df.dtypes


Unnamed: 0       int64
country         object
description     object
designation     object
points           int64
price          float64
province        object
region_1        object
region_2        object
taster          object
title           object
variety         object
winery          object
dtype: object

In [12]:
# Ensure numeric coercion: errors="coerce" turns any unparseable value into NaN.
# Both columns already show numeric dtypes in the listing above (price float64,
# points int64), so this acts as a safeguard here.
# NOTE(review): this runs after the missing-value imputation, so any NaN it
# introduced would not be filled — consider coercing before imputing.
df["price"] = pd.to_numeric(df["price"], errors="coerce")
df["points"] = pd.to_numeric(df["points"], errors="coerce")


#### Remove Duplicates

In [13]:
# Count rows that exactly repeat an earlier row across all columns.
# NOTE(review): "Unnamed: 0" is still part of the frame and looks like a per-row
# counter (min 0, max 129970 over 129971 rows in the describe() output). If so,
# every row is unique by construction and this count is 0 regardless of the
# actual review content — confirm, and consider excluding that column.
df.duplicated().sum()


np.int64(0)

In [14]:
# Drop rows that exactly repeat an earlier row (all columns compared).
df = df.drop_duplicates()


#### Clean Text Columns

In [15]:
# Normalise the free-text columns: lower-case first, then trim leading and
# trailing whitespace.
df["title"] = df["title"].str.lower().str.strip()
df["description"] = df["description"].str.lower().str.strip()


#### Shape After Cleaning

In [16]:
# Print the (rows, columns) after cleaning for comparison with the shape
# printed before cleaning.
print("Shape after cleaning:", df.shape)


Shape after cleaning: (129971, 13)


#### Final Sanity Inspection

In [17]:
# Final check of dtypes and non-null counts on the cleaned frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 13 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   Unnamed: 0   129971 non-null  int64  
 1   country      129971 non-null  object 
 2   description  129971 non-null  object 
 3   designation  92506 non-null   object 
 4   points       129971 non-null  int64  
 5   price        129971 non-null  float64
 6   province     129908 non-null  object 
 7   region_1     108724 non-null  object 
 8   region_2     50511 non-null   object 
 9   taster       103727 non-null  object 
 10  title        129971 non-null  object 
 11  variety      129970 non-null  object 
 12  winery       129971 non-null  object 
dtypes: float64(1), int64(2), object(10)
memory usage: 12.9+ MB


In [18]:
# Summary statistics for the numeric columns.
df.describe()


Unnamed: 0.1,Unnamed: 0,points,price
count,129971.0,129971.0,129971.0
mean,64985.0,88.447138,34.646083
std,37519.540256,3.03973,39.664385
min,0.0,80.0,4.0
25%,32492.5,86.0,18.0
50%,64985.0,88.0,25.0
75%,97477.5,91.0,40.0
max,129970.0,100.0,3300.0


#### Save Cleaned Dataset

In [19]:
# Write the cleaned frame to CSV; index=False leaves the DataFrame index out of the file.
df.to_csv("cleaned_winemag_data.csv", index=False)


#### 💥 MINI PROJECT — DATA CLEANING PIPELINE

In [20]:
def load_data(path):
    """Read a CSV source into a DataFrame using pandas' default parser settings.

    Args:
        path: Location of the CSV data; passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    raw_frame = pd.read_csv(filepath_or_buffer=path)
    return raw_frame

def clean_missing_values(df):
    """Drop the Twitter-handle column and impute missing price/country values.

    Missing ``price`` values are replaced by the column median, and missing
    ``country`` values by the most frequent country (first mode).

    Args:
        df: DataFrame with ``price`` and ``country`` columns; a
            ``taster_twitter_handle`` column is removed if present.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: If ``price`` or ``country`` is missing from ``df``.
    """
    # errors="ignore" keeps this step re-runnable on data that was already
    # cleaned (e.g. a previously saved CSV without the handle column).
    cleaned = df.drop(columns=["taster_twitter_handle"], errors="ignore")
    cleaned = cleaned.assign(
        price=cleaned["price"].fillna(cleaned["price"].median())
    )

    # mode() is empty when every country is missing; indexing it with [0]
    # would raise, so leave the column untouched in that case.
    country_modes = cleaned["country"].mode()
    if not country_modes.empty:
        cleaned = cleaned.assign(
            country=cleaned["country"].fillna(country_modes.iloc[0])
        )
    return cleaned

def fix_dtypes(df):
    """Coerce the ``price`` and ``points`` columns to numeric dtypes.

    Values that cannot be parsed as numbers become NaN (``errors="coerce"``).

    Args:
        df: DataFrame containing ``price`` and ``points`` columns.

    Returns:
        A new DataFrame with both columns converted. The input frame is left
        untouched, so the step can be re-run or compared against its input.

    Raises:
        KeyError: If ``price`` or ``points`` is missing from ``df``.
    """
    # assign() works on a copy, so the caller's frame is not mutated as a
    # side effect of calling this step.
    return df.assign(
        price=pd.to_numeric(df["price"], errors="coerce"),
        points=pd.to_numeric(df["points"], errors="coerce"),
    )

def clean_text(df):
    """Lower-case and whitespace-trim the ``title`` and ``description`` columns.

    Args:
        df: DataFrame containing string columns ``title`` and ``description``.

    Returns:
        A new DataFrame with both columns normalised. The input frame is left
        untouched; missing values stay missing.

    Raises:
        KeyError: If ``title`` or ``description`` is missing from ``df``.
    """
    # assign() works on a copy, so the caller's frame is not mutated as a
    # side effect of calling this step.
    return df.assign(
        title=df["title"].str.lower().str.strip(),
        description=df["description"].str.lower().str.strip(),
    )

def remove_duplicates(df, ignore_columns=("Unnamed: 0",)):
    """Drop rows that repeat an earlier row, ignoring row-counter columns.

    A leftover row-number column such as ``"Unnamed: 0"`` differs on every
    row, so comparing it means no two rows can ever be equal and nothing is
    removed. Such columns are left out of the comparison but kept in the
    result.

    Args:
        df: DataFrame to de-duplicate.
        ignore_columns: Collection of column names excluded from the duplicate
            comparison. Names not present in ``df`` are ignored. Pass ``()``
            to compare every column.

    Returns:
        A new DataFrame keeping the first occurrence of each distinct row,
        with all original columns.
    """
    ignored = tuple(ignore_columns or ())
    compare_on = [col for col in df.columns if col not in ignored]
    if not compare_on:
        # Nothing left to compare on: fall back to comparing all columns.
        return df.drop_duplicates()
    return df.drop_duplicates(subset=compare_on)

def save_clean_data(df, path):
    """Write ``df`` to ``path`` as CSV, leaving the DataFrame index out of the file.

    Args:
        df: DataFrame to persist.
        path: Output target handed to ``DataFrame.to_csv``.
    """
    df.to_csv(path_or_buf=path, index=False)


#### Run the Pipeline

In [21]:
# Run every cleaning step in order on a fresh load, then save the result.
# The path is a raw string: it contains "\d" and "\w", which are not valid
# escape sequences, so a normal string only works by accident and triggers an
# invalid-escape warning on recent Python versions.
df = (
    load_data(r"C:\data-analysis-basics\week2\winemag-data-130k-v2.csv.zip")
    .pipe(clean_missing_values)
    .pipe(fix_dtypes)
    .pipe(clean_text)
    .pipe(remove_duplicates)
)
save_clean_data(df, "cleaned_winemag_data.csv")
