In [None]:
# %pip install pandas numpy scikit-learn pyarrow

import pandas as pd
import numpy as np
import warnings

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")

In [55]:
def load_data(file_path):
    column_types = {
        'manufacturer': 'string',
        'model': 'string',
        'year': 'int32',
        'mileage': 'float32',
        'price': 'float32',
        'mpg': 'string',
        'fuel_type': 'string',
        'engine': 'string',
        'accidents_or_damage': 'float32',
        'one_owner': 'float32',
        'driver_rating': 'float32',
        'seller_rating': 'float32',
        'price_drop': 'float32'
    }

    df = pd.read_csv(
        file_path,
        dtype=column_types,
        engine='pyarrow'
    )

    # Fill NA đúng kiểu
    num_cols = df.select_dtypes(include='number').columns
    str_cols = df.select_dtypes(include='string').columns

    df[num_cols] = df[num_cols].fillna(0)
    df[str_cols] = df[str_cols].fillna('Unknown')

    return df


df_raw = load_data("cars.csv")
df_raw.head()


Unnamed: 0,manufacturer,model,year,mileage,engine,transmission,drivetrain,fuel_type,mpg,exterior_color,interior_color,accidents_or_damage,one_owner,personal_use_only,seller_name,seller_rating,driver_rating,driver_reviews_num,price_drop,price
0,Acura,ILX Hybrid 1.5L,2013,92945.0,"1.5L I-4 i-VTEC variable valve control, engine...",Automatic,Front-wheel Drive,Gasoline,39-38,Black,Parchment,0.0,0.0,0.0,Iconic Coach,0.0,4.4,12.0,300.0,13988.0
1,Acura,ILX Hybrid 1.5L,2013,47645.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,Gray,Ebony,1.0,1.0,1.0,Kars Today,0.0,4.4,12.0,0.0,17995.0
2,Acura,ILX Hybrid 1.5L,2013,53422.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,Bellanova White Pearl,Ebony,0.0,1.0,1.0,Weiss Toyota of South County,4.3,4.4,12.0,500.0,17000.0
3,Acura,ILX Hybrid 1.5L,2013,117598.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,Polished Metal Metallic,,0.0,1.0,1.0,Apple Tree Acura,0.0,4.4,12.0,675.0,14958.0
4,Acura,ILX Hybrid 1.5L,2013,114865.0,1.5L I4 8V MPFI SOHC Hybrid,Automatic CVT,Front-wheel Drive,Hybrid,39-38,,Ebony,1.0,0.0,1.0,Herb Connolly Chevrolet,3.7,4.4,12.0,300.0,14498.0


TẠO BIẾN RỦI RO

In [56]:
df = df_raw.copy()

# Drop clearly invalid price placeholders
df = df[df['price'] > 1].copy()

df['risk'] = np.where(
    (df['accidents_or_damage'] > 0) |
    (df['driver_rating'] < 3.5),
    1,  # rủi ro cao
    0   # rủi ro thấp
)

df[['accidents_or_damage', 'driver_rating', 'risk']].head()

Unnamed: 0,accidents_or_damage,driver_rating,risk
0,0.0,4.4,0
1,1.0,4.4,1
2,0.0,4.4,0
3,0.0,4.4,0
4,1.0,4.4,1


ENCODE DỮ LIỆU CHO AI

In [57]:
df_ml = df.copy()

label_encoders = {}
cat_cols = df_ml.select_dtypes(include='string').columns

for col in cat_cols:
    le = LabelEncoder()
    df_ml[col] = le.fit_transform(df_ml[col])
    label_encoders[col] = le


TRAIN AI – DỰ ĐOÁN RỦI RO

In [58]:
features_ai = [
    'year',
    'mileage',
    'one_owner',
    'driver_rating',
    'seller_rating'
]

X = df_ml[features_ai]
y = df_ml['risk']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf = RandomForestClassifier(
    n_estimators=150,
    random_state=42,
    class_weight='balanced'
)

rf.fit(X_train, y_train)

print("AI Accuracy:", rf.score(X_test, y_test))


AI Accuracy: 0.718504376123555


### GIẢI THÍCH MÔ HÌNH – FEATURE IMPORTANCE
Hiển thị mức độ đóng góp của từng biến đầu vào trong mô hình RandomForest để giải thích vì sao xe bị gắn nhãn rủi ro cao.

In [59]:
feature_importance = (
    pd.DataFrame({
        'feature': features_ai,
        'importance': rf.feature_importances_
    })
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
feature_importance

Unnamed: 0,feature,importance
0,mileage,0.670916
1,driver_rating,0.166126
2,seller_rating,0.084382
3,year,0.072457
4,one_owner,0.006118


AHP – CHỌN TIÊU CHÍ

In [60]:
ahp_criteria = [
    'price',
    'mileage',
    'year',
    'accidents_or_damage',
    'one_owner',
    'driver_rating',
    'seller_rating',
    'mpg',
    'price_drop'
]


AHP ĐÚNG SÁCH – MA TRẬN SO SÁNH CẶP

In [61]:
A = np.array([
# price mileage year accident owner rating seller mpg drop
 [1,   3,   5,   5,   7,   3,   3,   5,   3],  # price
 [1/3, 1,   3,   3,   5,   3,   3,   3,   3],  # mileage
 [1/5, 1/3, 1,   3,   3,   3,   3,   3,   3],  # year
 [1/5, 1/3, 1/3, 1,   3,   3,   3,   3,   3],  # accident
 [1/7, 1/5, 1/3, 1/3, 1,   1,   1,   3,   1],  # owner
 [1/3, 1/3, 1/3, 1/3, 1,   1,   3,   3,   3],  # rating
 [1/3, 1/3, 1/3, 1/3, 1,   1/3, 1,   3,   3],  # seller
 [1/5, 1/3, 1/3, 1/3, 1/3, 1/3, 1/3, 1,   1],  # mpg
 [1/3, 1/3, 1/3, 1/3, 1,   1/3, 1/3, 1,   1]   # price_drop
])


In [62]:
pairwise_df = pd.DataFrame(A, index=ahp_criteria, columns=ahp_criteria)
pairwise_df.round(3)


Unnamed: 0,price,mileage,year,accidents_or_damage,one_owner,driver_rating,seller_rating,mpg,price_drop
price,1.0,3.0,5.0,5.0,7.0,3.0,3.0,5.0,3.0
mileage,0.333,1.0,3.0,3.0,5.0,3.0,3.0,3.0,3.0
year,0.2,0.333,1.0,3.0,3.0,3.0,3.0,3.0,3.0
accidents_or_damage,0.2,0.333,0.333,1.0,3.0,3.0,3.0,3.0,3.0
one_owner,0.143,0.2,0.333,0.333,1.0,1.0,1.0,3.0,1.0
driver_rating,0.333,0.333,0.333,0.333,1.0,1.0,3.0,3.0,3.0
seller_rating,0.333,0.333,0.333,0.333,1.0,0.333,1.0,3.0,3.0
mpg,0.2,0.333,0.333,0.333,0.333,0.333,0.333,1.0,1.0
price_drop,0.333,0.333,0.333,0.333,1.0,0.333,0.333,1.0,1.0


TÍNH TRỌNG SỐ AHP + KIỂM TRA CR

In [63]:
# Chuẩn hóa
col_sum = A.sum(axis=0)
A_norm = A / col_sum

# Vector trọng số
weights = A_norm.mean(axis=1)
weights = weights / weights.sum()

ahp_weights = dict(zip(ahp_criteria, weights))
ahp_weights


{'price': np.float64(0.2950498644041707),
 'mileage': np.float64(0.1798264426870655),
 'year': np.float64(0.13291078095424816),
 'accidents_or_damage': np.float64(0.10991661161861539),
 'one_owner': np.float64(0.05211739716424173),
 'driver_rating': np.float64(0.08454745531934789),
 'seller_rating': np.float64(0.06703056736220836),
 'mpg': np.float64(0.03523408471619394),
 'price_drop': np.float64(0.043366795773908465)}

In [64]:
n = A.shape[0]
Aw = A.dot(weights)
lambda_max = np.mean(Aw / weights)
CI = (lambda_max - n) / (n - 1)

RI = 1.45  # RI cho n = 9
CR = CI / RI

print(f"Lambda max (λ_max): {lambda_max:.4f}")
print(f"Consistency Index (CI): {CI:.4f}")
print(f"Consistency Ratio (CR): {CR:.4f}")

if CR < 0.1:
    print("KẾT LUẬN: CR < 0.10 → ma trận được chấp nhận")
else:
    print("KẾT LUẬN: CR ≥ 0.10 → cần rà soát lại ma trận")


Lambda max (λ_max): 10.0922
Consistency Index (CI): 0.1365
Consistency Ratio (CR): 0.0942
KẾT LUẬN: CR < 0.10 → ma trận được chấp nhận


TÍNH ĐIỂM AHP CHO MỖI XE

In [65]:
def parse_mpg(value):
    try:
        if '-' in value:
            low, high = value.split('-')
            return (float(low) + float(high)) / 2
        else:
            return float(value)
    except:
        return 0.0


df['mpg'] = df['mpg'].apply(parse_mpg)


In [66]:
df_ahp = df[ahp_criteria].copy()

benefit = ['year', 'one_owner', 'driver_rating', 'seller_rating', 'mpg', 'price_drop']
cost = ['price', 'mileage', 'accidents_or_damage']

for col in benefit:
    df_ahp[col] = df_ahp[col] / df_ahp[col].max()

for col in cost:
    df_ahp[col] = df_ahp[col].min() / (df_ahp[col] + 1e-6)

df['ahp_score'] = sum(
    df_ahp[col] * ahp_weights[col]
    for col in ahp_criteria
)


KẾT HỢP AHP + AI = DSS

In [67]:
df['risk_pred'] = rf.predict(df_ml[features_ai])

df['recommendation'] = np.where(
    (df['ahp_score'] >= df['ahp_score'].quantile(0.7)) &
    (df['risk_pred'] == 0),
    'RECOMMENDED',
    'NOT_RECOMMENDED'
)


In [68]:
df_result = df.copy()

# Rank only low-risk cars without mutating the original df
ranked = df_result[df_result['risk_pred'] == 0]['ahp_score'].rank(
    method='dense', ascending=False
)

df_result['rank'] = ranked.fillna(0).astype(int)

df_result[
    df_result['rank'] > 0
][
    ['rank', 'manufacturer', 'model', 'price',
     'ahp_score', 'risk_pred', 'recommendation']
].sort_values('rank').head(10)

Unnamed: 0,rank,manufacturer,model,price,ahp_score,risk_pred,recommendation
73551,1.0,Buick,Encore Essence,299.0,0.519753,0,RECOMMENDED
736374,2.0,Volkswagen,Jetta S,319.0,0.510019,0,RECOMMENDED
584638,3.0,Nissan,Versa SV,259.0,0.508732,0,RECOMMENDED
600748,4.0,Nissan,Murano SV,299.0,0.476031,0,RECOMMENDED
599832,4.0,Nissan,Murano SV,299.0,0.476031,0,RECOMMENDED
553053,5.0,Mitsubishi,Outlander Sport SE,399.0,0.466396,0,RECOMMENDED
567062,6.0,Nissan,Cube 1.8 SL,750.0,0.424305,0,RECOMMENDED
566508,7.0,Nissan,Maxima SE,500.0,0.40234,0,RECOMMENDED
639501,8.0,Subaru,Ascent Limited,595.0,0.384166,0,RECOMMENDED
502637,9.0,Mazda,Mazda3 i Touring,1992.0,0.370941,0,RECOMMENDED
