In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Read the raw CSV into `df`; the path is relative to the notebook's
# working directory.
raw_csv_path = "AI Tools Dataset.csv"
df = pd.read_csv(raw_csv_path)

# Report (rows, columns) of the untouched data as a baseline for later cells.
raw_shape = df.shape
print("Original Dataset Shape:", raw_shape)

Original Dataset Shape: (16762, 9)


In [2]:
# Preview the top of the frame (DataFrame.head() returns the first 5 rows
# by default), printed under a header line.
head_preview = df.head()
print("\nFirst 5 rows:", head_preview, sep="\n")


First 5 rows:
                Name                    Category  \
0            ChatGPT   Communication And Support   
1             Claude   Operations And Management   
2      Google Gemini     Ai And Machine Learning   
3  Microsoft Copilot  Technology And Development   
4              LLaMA     Ai And Machine Learning   

                                 Primary Task  Year Founded  \
0  Automated conversational customer service.        2022.0   
1                             Task automation        2023.0   
2                       Large Language Models        2023.0   
3                    Automated conversational        2023.0   
4                       Large Language Models        2022.0   

                                   Short Description        Country  \
0  ChatGPT is an advanced AI language model by Op...        Estonia   
1  Automated assistant handles tasks and talks na...  United States   
2  Google Gemini is Google DeepMind’s next-genera...            NaN   
3  Micros

In [3]:
# Count NaN entries per column before any imputation is applied.
missing_per_column = df.isna().sum()
print("\nMissing values before cleaning:", missing_per_column, sep="\n")


Missing values before cleaning:
Name                     0
Category                 0
Primary Task             0
Year Founded         10931
Short Description      530
Country              10134
industry              8935
Website                  0
Website Status          27
dtype: int64


In [4]:
# Handling Missing Values
#
# Numeric columns are imputed with their column mean and object (text)
# columns with their most frequent value. `df` is modified in place, and
# `numeric_cols` is reused by the normalization cell further down.

# Fill numeric missing values with column mean
# NOTE(review): the "before cleaning" output shows "Year Founded" missing in
# 10931 of 16762 rows; mean imputation gives most rows the same, probably
# fractional, year -- confirm this is acceptable for downstream use.
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill categorical missing values with mode
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    modes = df[col].mode()
    # mode() ignores NaN, so a column with no observed values yields an empty
    # Series and indexing its first element would raise. There is nothing
    # sensible to impute in that case, so the column is left untouched.
    if modes.empty:
        continue
    # When several values tie, mode() returns them sorted; take the first.
    df[col] = df[col].fillna(modes.iloc[0])

print("\nMissing values after cleaning:")
print(df.isnull().sum())


Missing values after cleaning:
Name                 0
Category             0
Primary Task         0
Year Founded         0
Short Description    0
Country              0
industry             0
Website              0
Website Status       0
dtype: int64


In [5]:
# Normalization
#
# Rescale the numeric columns selected in the missing-value cell using
# MinMaxScaler with its default settings. The scaler is fitted on the same
# data it transforms, and the result overwrites those columns in `df`.

scaler = MinMaxScaler()

# Skip scaling entirely when the frame has no numeric columns.
if len(numeric_cols):
    scaled_values = scaler.fit_transform(df[numeric_cols])
    df[numeric_cols] = scaled_values
    print("\nNumeric columns normalized using Min-Max Scaling")


Numeric columns normalized using Min-Max Scaling


In [6]:
# Save Cleaned Dataset
#
# Write the imputed and scaled frame to a CSV in the working directory;
# index=False keeps the row index out of the file.
cleaned_csv_path = "AI_tools_cleaned_dataset.csv"
df.to_csv(cleaned_csv_path, index=False)

print("\nCleaned dataset saved as AI_tools_cleaned_dataset.csv")


Cleaned dataset saved as AI_tools_cleaned_dataset.csv
