# Data Handling

Load the scraped dataset from the CSV file, then drop corrupted rows (rows with a missing value in any field) and duplicate rows.

In [9]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import csv

# Load the raw scrape. The path is relative, so it is resolved against the
# notebook's working directory.
df = pd.read_csv("Amazon Items Scraped.csv")
print(f"Shape before cleaning: {df.shape}")

# Remove Corrupted Rows ---------------------------------------------------------------------
# A row is kept only when every field is present. how="any" says that directly;
# the earlier thresh=8 meant the same thing only because the frame has exactly
# 8 columns, and would silently keep incomplete rows if a column were added.
df = df.dropna(axis=0, how="any")
print(f"Shape after removing corrupted: {df.shape}")

# Drop Duplicates --------------------------------------------------------------------------
# subset=None compares rows on all columns; the first occurrence is the one kept.
df = df.drop_duplicates(subset=None, keep="first")
print(f"Shape after removing duplicates: {df.shape}")


Shape before cleaning: (13134, 8)
Shape after removing corrupted: (5230, 8)
Shape after removing duplicates: (5116, 8)


Cleaning and normalizing the data

The scraped fields contain extra text that carries no information, so each feature has to be cleaned before it can be used. For example, the 'Price' field needs its '$' sign removed before it can be converted to a number.

In [10]:
# Function - Delete char in string ---------------------------------------------------------

def deleteCharInString(df, c) :
    """Remove every occurrence of the substring ``c`` from each string in ``df``.

    Parameters
    ----------
    df : iterable
        Values to clean. Despite the name, this is iterated element by
        element, so callers pass a single column rather than a whole frame.
    c : str
        Substring to delete (it may be longer than one character).

    Returns
    -------
    list
        One entry per input element, in input order. Strings come back with
        ``c`` removed; any non-string element (for example a NaN) is
        returned unchanged.
    """
    # Non-strings have no .replace(); pass them through rather than raising
    # AttributeError on the first missing value in the column.
    return [i.replace(c, '') if isinstance(i, str) else i for i in df]

In [11]:
# Cleaning the Brand name ------------------------------------------------------------------

# Strip the storefront boilerplate that surrounds the actual brand name.
for boilerplate in ("Brand:", "Visit the ", " Store"):
    df["Brand"] = deleteCharInString(df["Brand"], boilerplate)

# A Brand of exactly "0" is treated as missing (presumably the scraper's
# placeholder -- verify against the scraper), so it becomes NaN and the row is
# dropped just below.
df["Brand"] = [np.nan if brand == "0" else brand for brand in df["Brand"]]

# Drop rows with any missing field. how="any" replaces thresh=8, which only
# expressed the same rule while the frame has exactly 8 columns.
df = df.dropna(axis=0, how="any")

print(df["Brand"])

0                 Amazon
4          Amazon Basics
5                 Amazon
7                 Amazon
8                 EDIVIA
              ...       
12972     American Eagle
12991             Amazon
13002             Amazon
13031             Amazon
13117          Frito-Lay
Name: Brand, Length: 3454, dtype: object


In [12]:
# Cleaning the Ratings ------------------------------------------------------------------

# Strip the text decoration around the number. " ratings" has to be removed
# before " rating", otherwise the plural form would be left with a stray "s".
for decoration in (" ratings", " rating", ","):
    df["Rating"] = deleteCharInString(df["Rating"], decoration)

# With the suffix and the commas gone, the column is converted to float.
rating_as_float = df["Rating"].astype(float)
df["Rating"] = rating_as_float
print(df["Rating"])

0         64803.0
4          7851.0
5        145478.0
7        117007.0
8            82.0
           ...   
12972      5999.0
12991      3950.0
13002      7520.0
13031      3014.0
13117     17706.0
Name: Rating, Length: 3454, dtype: float64


In [13]:
# Cleaning the Stars ------------------------------------------------------------------

df["Stars"] = deleteCharInString(df["Stars"], " out of 5")

# Substring fixes applied in this order. Text that is not a score is replaced
# by "0" so the column still converts to float; the German-formatted
# "4,8 von 5" is rewritten with a decimal point.
star_fixes = (
    ("How confident are you in finding a size that will fit you well?", '0'),
    ("|", '0'),
    ("confirmed", '0'),
    ("4,8 von 5", '4.8'),
)
for artefact, substitute in star_fixes:
    df["Stars"] = [value.replace(artefact, substitute) for value in df["Stars"]]

df["Stars"] = df["Stars"].astype(float)
# 0 only stood in for "no usable score", so turn it into a missing value.
df.loc[df["Stars"] == 0, "Stars"] = np.nan


df["Stars"]



0        4.5
4        4.6
5        4.0
7        4.7
8        4.3
        ... 
12972    4.9
12991    4.9
13002    4.9
13031    4.9
13117    4.6
Name: Stars, Length: 3454, dtype: float64

In [14]:
# Cleaning the Price ------------------------------------------------------------------

# Remove the currency sign and the commas so the text can be parsed as a number.
for symbol in ("$", ","):
    df["Price"] = deleteCharInString(df["Price"], symbol)

# A Price of exactly "0" is treated as missing (presumably the scraper's
# placeholder -- verify against the scraper), so it becomes NaN here.
df["Price"] = [np.nan if price == "0" else price for price in df["Price"]]

# Drop rows with any missing field before the numeric conversion. how="any"
# replaces thresh=8, which only expressed the same rule while the frame has
# exactly 8 columns.
df = df.dropna(axis=0, how="any")
df["Price"] = df["Price"].astype(float)
print(df["Price"])

8         16.99
10        13.95
12        29.88
16        30.40
17        14.44
          ...  
12972     25.00
12991    100.00
13002     10.00
13031     30.00
13117     28.00
Name: Price, Length: 2706, dtype: float64


In [15]:
# Cleaning the Comments ------------------------------------------------------------------

# Strip the "+" marker and the " answered questions" suffix, leaving the number.
for decoration in ("+", " answered questions"):
    df["Comments"] = deleteCharInString(df["Comments"], decoration)

df["Comments"] = df["Comments"].astype(float)
# A count of 0 is recorded as a missing value rather than as a real count.
is_zero_count = df["Comments"] == 0
df.loc[is_zero_count, "Comments"] = np.nan
print(df["Comments"])


8           NaN
10         28.0
12       1000.0
16          NaN
17        277.0
          ...  
12972       4.0
12991      15.0
13002      12.0
13031       3.0
13117      33.0
Name: Comments, Length: 2706, dtype: float64


Creating a cleaned csv file 'Amazon Items Cleaned.csv'

In [16]:
# Reset DataFrame Index and delete NaN rows ---------------------------------------------------------------

# Drop every row that still has a missing field (this includes the Stars and
# Comments values that were set to NaN during cleaning). how="any" replaces
# thresh=8, which only expressed the same rule while the frame has exactly
# 8 columns.
df = df.dropna(axis=0, how="any")
# Renumber rows 0..n-1; drop=True discards the old, now gappy, index.
df = df.reset_index(drop=True)

# index=False is the documented way to leave the index column out of the file;
# the previous index=None only worked because None is falsy.
df.to_csv("Amazon Items Cleaned.csv", index=False)
df.shape

(1636, 8)