In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load cleaned dataset
df = pd.read_csv("AI_tools_cleaned_dataset.csv")

# FEATURE ENGINEERING

In [5]:
# 1. Tool Age Feature
current_year = 2025
df['Tool_Age'] = current_year - df['Year Founded']

In [6]:
# 2. Description Length Feature
df['Desc_Length'] = df['Short Description'].astype(str).apply(len)

In [7]:
# 3. Website Active Flag
df['Website_Active'] = df['Website Status'].apply(lambda x: 1 if x == 'Active' else 0)

In [8]:
# 4. Popular Country Flag
top_countries = df['Country'].value_counts().head(5).index
df['Top_Country'] = df['Country'].apply(lambda x: 1 if x in top_countries else 0)

In [9]:
# ENCODING 

label_cols = ['Category', 'Primary Task', 'Country', 'industry']

le = LabelEncoder()
for col in label_cols:
    df[col] = le.fit_transform(df[col])

In [10]:
# MODEL EVALUATION 

# Target: Predict Website Status (Active / Inactive)
y = df['Website_Active']

# Features
X = df[['Category', 'Primary Task', 'Country', 'industry',
        'Tool_Age', 'Desc_Length', 'Top_Country']]

In [11]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [12]:
# Model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [15]:
# Prediction
y_pred = model.predict(X_test)

In [16]:
# Accuracy
acc = accuracy_score(y_test, y_pred)

print("Model Accuracy after Feature Engineering:", round(acc * 100, 2), "%")

Model Accuracy after Feature Engineering: 80.91 %
