# Data Cleaning: Eliminating Duplicate Data

In [1]:
import pandas as pd
import swifteda as sa
from swifteda.analyzer import DataSetAnalyzer  # Importing the class to manually create an object

# --- Build a small DataFrame that contains one exact duplicate ---
# Index 0 and index 3 hold the same values (ID=1, Name='Ana', Age=25),
# so there is exactly one redundant row for the cleaner to remove.
test_data = {
    'ID': [1, 2, 3, 1, 4],
    'Name': ['Ana', 'Bruno', 'Carla', 'Ana', 'Daniel'],
    'Age': [25, 30, 22, 25, 40],
}
df_test = pd.DataFrame(data=test_data)

print("--- Original Test Dataset ---")
display(df_test)

# --- Analyze and clean the test DataFrame with swifteda ---
# Wrap the frame in an analyzer instance created by hand.
test_analysis = DataSetAnalyzer(df_test)

# Method chaining: first report the rows that belong to duplicate sets,
# then drop the redundant copies.
(
    test_analysis
    .find_duplicates()
    .drop_duplicates()
)

# --- Inspect the final result ---
print("\n--- Test Dataset After Cleaning ---")
# The cleaned frame is read back from the analyzer object itself.
# NOTE(review): `_df` has a leading underscore, which suggests an internal
# attribute; switch to a public accessor if swifteda offers one — TODO confirm.
display(test_analysis._df)

--- Original Test Dataset ---


Unnamed: 0,ID,Name,Age
0,1,Ana,25
1,2,Bruno,30
2,3,Carla,22
3,1,Ana,25
4,4,Daniel,40


🚨 Found 2 rows that are part of duplicate sets.


Unnamed: 0,ID,Name,Age
0,1,Ana,25
3,1,Ana,25



🗑️ 1 duplicate row(s) removed. 4 rows remain.

--- Test Dataset After Cleaning ---


Unnamed: 0,ID,Name,Age
0,1,Ana,25
1,2,Bruno,30
2,3,Carla,22
4,4,Daniel,40


# Feature Engineering: Extracting Datetime Features from a Date Column

In [2]:
import pandas as pd
import swifteda as sa
from swifteda.analyzer import DataSetAnalyzer

# --- Build a small sales DataFrame whose dates are stored as strings ---
sales_data = {
    'SaleDate': ['2023-01-15', '2023-03-22', '2024-07-04', '2024-12-25'],
    'Product': ['A', 'B', 'A', 'C'],
    'Value': [100, 150, 120, 200],
}
df_sales = pd.DataFrame(data=sales_data)

print("--- Original Sales Dataset ---")
display(df_sales)

# --- Extract date features with swifteda ---
sales_analysis = DataSetAnalyzer(df_sales)

# Derive calendar features from the 'SaleDate' column.
sales_analysis.extract_datetime_features(
    date_column='SaleDate',
)

# --- Inspect the final result ---
print("\n--- Sales Dataset with New Features ---")
# The transformed frame is read back from the analyzer object itself.
# NOTE(review): `_df` has a leading underscore, which suggests an internal
# attribute; switch to a public accessor if swifteda offers one — TODO confirm.
display(sales_analysis._df)

--- Original Sales Dataset ---


Unnamed: 0,SaleDate,Product,Value
0,2023-01-15,A,100
1,2023-03-22,B,150
2,2024-07-04,A,120
3,2024-12-25,C,200


🛠️  Datetime features extracted from column 'SaleDate'.

🗑️ Column(s) ['SaleDate'] removed.

--- Sales Dataset with New Features ---


Unnamed: 0,Product,Value,SaleDate_Year,SaleDate_Month,SaleDate_Day,SaleDate_DayOfWeek,SaleDate_WeekOfYear
0,A,100,2023,1,15,6,2
1,B,150,2023,3,22,2,12
2,A,120,2024,7,4,3,27
3,C,200,2024,12,25,2,52
