In [1]:
import pandas as pd

In [2]:
#Data loading
file_path = 'Resources/shopping_data.csv'
df_shopping = pd.read_csv(file_path, encoding='ISO-8859-1')
df_shopping.head(5)

Unnamed: 0,CustomerID,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,Yes,19.0,15000,39.0
1,2,Yes,21.0,15000,81.0
2,3,No,20.0,16000,6.0
3,4,No,23.0,16000,77.0
4,5,No,31.0,17000,40.0


In [3]:
# Show columns
df_shopping.columns

Index(['CustomerID', 'Card Member', 'Age', 'Annual Income',
       'Spending Score (1-100)'],
      dtype='object')

In [4]:
# List df data types
df_shopping.dtypes

CustomerID                  int64
Card Member                object
Age                       float64
Annual Income               int64
Spending Score (1-100)    float64
dtype: object

In [5]:
# find null values
for column in df_shopping.columns:
    print(f'Column {column} has {df_shopping[column].isnull().sum()} null values, with {df_shopping[column].count()} total rows')

Column CustomerID has 0 null values, with 203 total rows
Column Card Member has 2 null values, with 201 total rows
Column Age has 2 null values, with 201 total rows
Column Annual Income has 0 null values, with 203 total rows
Column Spending Score (1-100) has 1 null values, with 202 total rows


In [6]:
# Drop null rows
df_shopping_trim = df_shopping.dropna()

In [7]:
# Find duplicate entries
print(f'Duplicate entries: {df_shopping_trim.duplicated().sum()}')

Duplicate entries: 0


In [8]:
# Remove the CustomerID column
df_shopping_trim.drop(columns=['CustomerID'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
df_shopping_trim.head()

Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,Yes,19.0,15000,39.0
1,Yes,21.0,15000,81.0
2,No,20.0,16000,6.0
3,No,23.0,16000,77.0
4,No,31.0,17000,40.0


In [10]:
# Transform string column
def change_string(member):
    if member == "Yes":
        return 1
    else:
        return 0

df_shopping_trim['Card Member'] = df_shopping_trim["Card Member"].apply(change_string)
df_shopping_trim.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15000,39.0
1,1,21.0,15000,81.0
2,0,20.0,16000,6.0
3,0,23.0,16000,77.0
4,0,31.0,17000,40.0


In [11]:
# Transform annual income
df_shopping_trim['Annual Income'] = df_shopping_trim['Annual Income'] / 1000
df_shopping_trim.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Card Member,Age,Annual Income,Spending Score (1-100)
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [16]:
df_shopping_trim = df_shopping_trim.rename(columns = {'Card Member':'Card_Member', 'Annual Income':'Annual_Income', 'Spending Score (1-100)':'Spending_score'})
df_shopping_trim.head()

Unnamed: 0,Card_Member,Age,Annual_Income,Spending_score
0,1,19.0,15.0,39.0
1,1,21.0,15.0,81.0
2,0,20.0,16.0,6.0
3,0,23.0,16.0,77.0
4,0,31.0,17.0,40.0


In [17]:
# Saving cleaned data
file_path = 'Resources/shopping_data_cleaned.csv'
df_shopping_trim.to_csv(file_path, index=False)