In [1]:
import pandas as pd
import numpy as np

## Books Data Set Cleaning

In [2]:
df = pd.read_csv('books_all.csv')
df.head()

Unnamed: 0,title,price,rating,availability
0,A Light in the Attic,Â£51.77,Three,In stock
1,Tipping the Velvet,Â£53.74,One,In stock
2,Soumission,Â£50.10,One,In stock
3,Sharp Objects,Â£47.82,Four,In stock
4,Sapiens: A Brief History of Humankind,Â£54.23,Five,In stock


In [3]:
#checking the data types of column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         1000 non-null   object
 1   price         1000 non-null   object
 2   rating        1000 non-null   object
 3   availability  1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB


In [4]:
#changing the data type of column 'price' to float
df['price'] = df['price'].str.replace('£', '', regex=False)
df['price'] = df['price'].str.replace('Â', '', regex=False)


df['price'] = df['price'].astype(float)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   price         1000 non-null   float64
 2   rating        1000 non-null   object 
 3   availability  1000 non-null   object 
dtypes: float64(1), object(3)
memory usage: 31.4+ KB


In [6]:
# changing rating to int
rating_map={
    'One'   : 1,
    'Two'   : 2,
    'Three' : 3,
    'Four'  : 4,
    'Five'  : 5
}

df['rating']=df['rating'].map(rating_map)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   price         1000 non-null   float64
 2   rating        1000 non-null   int64  
 3   availability  1000 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 31.4+ KB


In [7]:
df.head()

Unnamed: 0,title,price,rating,availability
0,A Light in the Attic,51.77,3,In stock
1,Tipping the Velvet,53.74,1,In stock
2,Soumission,50.1,1,In stock
3,Sharp Objects,47.82,4,In stock
4,Sapiens: A Brief History of Humankind,54.23,5,In stock


In [8]:
# keeping availability as flag
df['availability'] = df['availability'].str.strip()
df['availability']= df['availability'].apply(lambda x: 1 if 'In stock' else 0)
df.head()

Unnamed: 0,title,price,rating,availability
0,A Light in the Attic,51.77,3,1
1,Tipping the Velvet,53.74,1,1
2,Soumission,50.1,1,1
3,Sharp Objects,47.82,4,1
4,Sapiens: A Brief History of Humankind,54.23,5,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1000 non-null   object 
 1   price         1000 non-null   float64
 2   rating        1000 non-null   int64  
 3   availability  1000 non-null   int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 31.4+ KB


In [12]:
df.count()

title           1000
price           1000
rating          1000
availability    1000
dtype: int64

In [11]:
# dropping duplicates from title (if any)
df['title'].drop_duplicates()

0                                   A Light in the Attic
1                                     Tipping the Velvet
2                                             Soumission
3                                          Sharp Objects
4                  Sapiens: A Brief History of Humankind
                             ...                        
995    Alice in Wonderland (Alice's Adventures in Won...
996     Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)
997    A Spy's Devotion (The Regency Spies of London #1)
998                  1st to Die (Women's Murder Club #1)
999                   1,000 Places to See Before You Die
Name: title, Length: 999, dtype: object

In [13]:
df.to_csv('books_all.csv')