## Data Cleaning

## 1. Dataset walkthrough



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw, not-yet-cleaned movie ratings table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='./movie_ratings_not_clean.txt')
df.head(n=5)

Unnamed: 0,UserID,Braveheart,TheEnglishPatient,Titanic,ShakespeareinLove,AmericanBeauty,Gladiator,ABeautifulMind,Chicago,TheLordoftheRings:TheReturnoftheKing,MillionDollarBaby,Crash,TheDeparted,NoCountryforOldMen,SlumdogMillionaire,TheHurtLocker
0,Olivia,3.0,40.0,2.0,4,,1.0,2,,2.0,4.0,3.0,2.0,5.0,4.0,1
1,Noah,3.0,5.0,5.0,1,3.0,4.0,0,,1.0,5.0,4.0,3.0,0.0,0.0,2
2,Emma,2.0,1.0,3.0,3,5.0,5.0,5,,3.0,3.0,0.0,2.0,4.0,2.0,4
3,Liam,0.0,1.0,3.0,0,3.0,5.0,1,,0.0,1.0,4.0,1.0,3.0,3.0,3
4,Ava,3.0,4.0,2.0,5,0.0,99.0,1,,,5.0,5.0,5.0,1.0,,5


In [None]:
# Summary statistics for the numeric columns. In the recorded output the maxima of
# Braveheart (9), TheEnglishPatient (200) and Gladiator (99) stand out against the
# maximum of 5 seen in the other columns, and Chicago has a count of 0.
df.describe()

Unnamed: 0,Braveheart,TheEnglishPatient,Titanic,Gladiator,Chicago,TheLordoftheRings:TheReturnoftheKing,MillionDollarBaby,Crash,TheDeparted,NoCountryforOldMen,SlumdogMillionaire,TheHurtLocker
count,42.0,39.0,47.0,42.0,0.0,33.0,47.0,45.0,40.0,45.0,47.0,50.0
mean,2.761905,9.051282,2.659574,4.619048,,2.424242,2.851064,2.6,2.325,2.666667,2.489362,2.28
std,1.960706,32.019689,1.564228,15.003987,,1.750541,1.693706,1.656941,1.845125,1.809068,1.816259,1.750102
min,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,1.0,,1.0,1.5,1.0,0.75,1.0,1.0,1.0
50%,3.0,3.0,3.0,2.0,,2.0,3.0,3.0,2.0,3.0,3.0,2.0
75%,4.0,4.5,4.0,4.0,,4.0,4.0,4.0,4.0,4.0,4.0,4.0
max,9.0,200.0,5.0,99.0,,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [None]:
# Per-column dtypes and non-null counts. In the recorded output ShakespeareinLove,
# AmericanBeauty and ABeautifulMind load as object rather than float64, UserID has
# 49 non-null values out of 50 rows, and Chicago has 0 non-null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   UserID                                49 non-null     object 
 1   Braveheart                            42 non-null     float64
 2   TheEnglishPatient                     39 non-null     float64
 3   Titanic                               47 non-null     float64
 4   ShakespeareinLove                     44 non-null     object 
 5   AmericanBeauty                        44 non-null     object 
 6   Gladiator                             42 non-null     float64
 7   ABeautifulMind                        44 non-null     object 
 8   Chicago                               0 non-null      float64
 9   TheLordoftheRings:TheReturnoftheKing  33 non-null     float64
 10  MillionDollarBaby                     47 non-null     float64
 11  Crash                

## 2. Understanding Pandas methods

In [None]:
# Small users table: one row per user, with a numeric UserID and a display name.
users_df = pd.DataFrame(
    dict(
        UserID=[1, 2, 3],
        Name=['Alice', 'Bob', 'Charlie'],
    )
)

In [None]:
# Small purchases table: one row per purchase. UserID 1 appears twice and
# UserID 3 does not appear at all.
purchases_df = pd.DataFrame(
    dict(
        UserID=[1, 1, 2],
        Product=['Book', 'Pen', 'Book'],
    )
)

In [None]:
# Display the users table.
users_df

Unnamed: 0,UserID,Name
0,1,Alice
1,2,Bob
2,3,Charlie


In [None]:
# Display the purchases table.
purchases_df

Unnamed: 0,UserID,Product
0,1,Book
1,1,Pen
2,2,Book


### Merge df

In [None]:
# Attach each purchase to its user's name through the shared UserID column.
# An inner join (the default) keeps only UserIDs present in both tables, so a
# user without any purchase does not appear in the result.
merged_df = users_df.merge(purchases_df, how='inner', on='UserID')
merged_df

Unnamed: 0,UserID,Name,Product
0,1,Alice,Book
1,1,Alice,Pen
2,2,Bob,Book


### Groupby

In [None]:
# For every user name, count the non-null entries in each remaining column.
purchase_counts = (
    merged_df
    .groupby(by='Name')
    .count()
)
purchase_counts

Unnamed: 0_level_0,UserID,Product
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alice,2,2
Bob,1,1


### Messy dataframe

In [None]:
# Deliberately messy example: Mia's rating is missing, and the Liam/B/3 row
# occurs twice.
messy_df = pd.DataFrame(
    dict(
        user=['Zoe', 'Liam', 'Mia', 'Liam'],
        movie=['A', 'B', 'C', 'B'],
        rating=[5, 3, np.nan, 3],
    )
)

In [None]:
# Display the messy example table (note the missing rating and the repeated row).
messy_df

Unnamed: 0,user,movie,rating
0,Zoe,A,5.0
1,Liam,B,3.0
2,Mia,C,
3,Liam,B,3.0


### Drop NA

In [None]:
# Discard every row that has at least one missing value; messy_df itself is
# left unchanged because the result is a new frame.
dropped_na_df = messy_df.dropna(axis=0, how='any')
dropped_na_df

Unnamed: 0,user,movie,rating
0,Zoe,A,5.0
1,Liam,B,3.0
3,Liam,B,3.0


### Fill NA

In [None]:
# Replace every missing value with 0 instead of dropping the row
# (in messy_df the only missing value is Mia's rating).
filled_na_df = messy_df.fillna(0)
filled_na_df

Unnamed: 0,user,movie,rating
0,Zoe,A,5.0
1,Liam,B,3.0
2,Mia,C,0.0
3,Liam,B,3.0


### Drop Duplicates

In [None]:
# Remove rows that repeat an earlier row in every column, keeping the first
# occurrence (the second Liam/B/3 row is removed).
unique_df = messy_df.drop_duplicates(keep='first')
unique_df

Unnamed: 0,user,movie,rating
0,Zoe,A,5.0
1,Liam,B,3.0
2,Mia,C,


## 3. Cleaning the Dataset

In [None]:
# (rows, columns) before any cleaning step.
df.shape

(50, 16)

In [None]:
# Remove rows that exactly duplicate an earlier row, modifying df in place.
# Because this mutates df, re-running earlier display cells will show the
# modified frame rather than the raw data.
df.drop_duplicates(inplace=True)

In [None]:
# Shape after de-duplication; the recorded output is still (50, 16), i.e. no
# exact duplicate rows were present.
df.shape

(50, 16)

In [None]:
# Drop rows whose UserID is missing, modifying df in place. Only the UserID
# column is checked; missing ratings do not cause a row to be dropped here.
df.dropna(subset=['UserID'], inplace=True)

In [None]:
# Shape after dropping rows without a UserID; the recorded output shows one
# row fewer, (49, 16).
df.shape

(49, 16)

In [None]:
# Preview the first rows again after the two in-place cleaning steps.
df.head()

Unnamed: 0,UserID,Braveheart,TheEnglishPatient,Titanic,ShakespeareinLove,AmericanBeauty,Gladiator,ABeautifulMind,Chicago,TheLordoftheRings:TheReturnoftheKing,MillionDollarBaby,Crash,TheDeparted,NoCountryforOldMen,SlumdogMillionaire,TheHurtLocker
0,Olivia,3.0,40.0,2.0,4,,1.0,2,,2.0,4.0,3.0,2.0,5.0,4.0,1
1,Noah,3.0,5.0,5.0,1,3.0,4.0,0,,1.0,5.0,4.0,3.0,0.0,0.0,2
2,Emma,2.0,1.0,3.0,3,5.0,5.0,5,,3.0,3.0,0.0,2.0,4.0,2.0,4
3,Liam,0.0,1.0,3.0,0,3.0,5.0,1,,0.0,1.0,4.0,1.0,3.0,3.0,3
4,Ava,3.0,4.0,2.0,5,0.0,99.0,1,,,5.0,5.0,5.0,1.0,,5


In [None]:
# Highest rating treated as valid by the filter below.
MAX_RATING = 5

# First, ensure the column is numeric; values that cannot be parsed become NaN.
df['Braveheart'] = pd.to_numeric(df['Braveheart'], errors='coerce')

# Then keep only the rows whose Braveheart rating is at most MAX_RATING.
# - Rows with a missing (NaN) Braveheart rating are dropped as well, because
#   NaN <= MAX_RATING evaluates to False.
# - Only Braveheart is filtered; other columns can still hold out-of-range values.
# - .copy() makes cleaned_df an explicit, independent frame so that later
#   assignments to it do not raise SettingWithCopyWarning.
cleaned_df = df[df['Braveheart'] <= MAX_RATING].copy()

In [None]:
# Summary statistics after the Braveheart filter. In the recorded output the
# Braveheart maximum is now 5, while TheEnglishPatient (200) and Gladiator (99)
# still show out-of-range maxima because they were not filtered.
cleaned_df.describe()

Unnamed: 0,Braveheart,TheEnglishPatient,Titanic,Gladiator,Chicago,TheLordoftheRings:TheReturnoftheKing,MillionDollarBaby,Crash,TheDeparted,NoCountryforOldMen,SlumdogMillionaire,TheHurtLocker
count,40.0,32.0,38.0,36.0,0.0,27.0,37.0,36.0,31.0,36.0,37.0,40.0
mean,2.6,10.3125,2.710526,5.055556,,2.407407,2.864865,2.583333,1.967742,2.638889,2.648649,2.45
std,1.736486,35.295469,1.57528,16.189846,,1.715523,1.718561,1.64534,1.834555,1.759103,1.829029,1.753385
min,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.75,2.0,1.0,,1.0,1.0,1.0,0.0,1.0,1.0,1.0
50%,3.0,3.0,3.0,2.0,,2.0,3.0,3.0,2.0,3.0,3.0,2.5
75%,4.0,4.25,4.0,4.0,,4.0,4.0,4.0,3.5,4.0,4.0,4.0
max,5.0,200.0,5.0,99.0,,5.0,5.0,5.0,5.0,5.0,5.0,5.0


## 4. Numpy

### Speed

In [None]:
import numpy as np
import time

In [None]:
# Benchmark input: the integers 0 .. list_size - 1, held both as a plain
# Python list and as a NumPy array with the same contents.
list_size = 1_000_000
python_list = list(range(list_size))
# Pin the dtype to 64-bit. Where NumPy's default integer is 32-bit (for example
# NumPy 1.x on Windows) the sum of these values, 499999500000, would overflow
# and no longer match the Python-list result.
numpy_array = np.array(python_list, dtype=np.int64)

In [None]:
print("--- Using Python List ---")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; time.time() is a wall-clock that can be coarse on some
# platforms and can jump if the system clock is adjusted.
start_time = time.perf_counter()

# Deliberately an explicit Python-level loop: this is the baseline that the
# NumPy version is compared against.
list_sum = 0
for number in python_list:
    list_sum += number

end_time = time.perf_counter()
list_duration = end_time - start_time
print(f"Sum: {list_sum}")
print(f"Time taken: {list_duration:.6f} seconds")

--- Using Python List ---
Sum: 499999500000
Time taken: 0.112150 seconds


In [None]:
print("\n--- Using NumPy Array ---")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals. The operation timed here takes about a millisecond in the
# recorded run, which a coarse wall-clock such as time.time() may report as 0.
start_time = time.perf_counter()

# A single vectorized reduction over the whole array.
numpy_sum = np.sum(numpy_array)

end_time = time.perf_counter()
numpy_duration = end_time - start_time
print(f"Sum: {numpy_sum}")
print(f"Time taken: {numpy_duration:.6f} seconds")


--- Using NumPy Array ---
Sum: 499999500000
Time taken: 0.001263 seconds


In [None]:
print("\n--- Conclusion ---")
# Guard the ratio: if the timer reported 0 seconds for the NumPy run (possible
# with a coarse clock), dividing would raise ZeroDivisionError.
if numpy_duration > 0:
    print(f"NumPy was {list_duration / numpy_duration:.2f} times faster.")
else:
    print("NumPy finished too quickly to measure; increase list_size for a meaningful ratio.")


--- Conclusion ---
NumPy was 88.77 times faster.


### Multi-dimensional arrays

In [None]:
# A 1-D array behaves like a single list of values.
arr_1d = np.array([10, 20, 30, 40])

# One print call; sep="\n" puts each item on its own line.
print(
    "--- 1. One-Dimensional Array (Vector) ---",
    arr_1d,
    f"Dimensions: {arr_1d.ndim}",
    f"Shape: {arr_1d.shape}",
    f"Accessing the second element: {arr_1d[1]}",  # index 1 -> 20
    sep="\n",
)

--- 1. One-Dimensional Array (Vector) ---
[10 20 30 40]
Dimensions: 1
Shape: (4,)
Accessing the second element: 20


In [None]:
# A 2-D array is a grid of rows and columns, like a spreadsheet.
arr_2d = np.array([[1, 2, 3],
                   [4, 5, 6]])

# Build the report as one newline-joined string and print it in a single call.
print("\n".join([
    "\n--- 2. Two-Dimensional Array (Matrix) ---",
    str(arr_2d),
    f"Dimensions: {arr_2d.ndim}",
    f"Shape: {arr_2d.shape}",
    f"Accessing element at row 1, column 2: {arr_2d[1, 2]}",
]))


--- 2. Two-Dimensional Array (Matrix) ---
[[1 2 3]
 [4 5 6]]
Dimensions: 2
Shape: (2, 3)
Accessing element at row 1, column 2: 6


In [None]:
print("\n--- 3. Three-Dimensional Array (Tensor) ---")
# Like a cube of data or a collection of matrices
arr_3d = np.array([
    [[1, 2], [3, 4]],    # First 2D matrix (depth 0)
    [[5, 6], [7, 8]]     # Second 2D matrix (depth 1)
])
print(arr_3d)
print(f"Dimensions: {arr_3d.ndim}")
print(f"Shape: {arr_3d.shape}")
print(f"Accessing element at depth 1, row 0, column 1: {arr_3d[1, 0, 1]}")


--- 3. Three-Dimensional Array (Tensor) ---
[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]]
Dimensions: 3
Shape: (2, 2, 2)
Accessing element at depth 1, row 0, column 1: 6
