In [10]:
# %pip install pandas numpy scikit-learn pyarrow

import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by pandas / scikit-learn.
warnings.filterwarnings('ignore')

# --- DATA LOADING FUNCTION ---
def load_data_optimized(file_path):
    """Load the car listings CSV, falling back to a small synthetic sample.

    Only the columns listed in ``column_types`` are read, each with the dtype
    given there. The read is first attempted with the ``pyarrow`` engine and,
    if that attempt fails with an ordinary exception, retried with pandas'
    default engine.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.DataFrame: The CSV contents with missing values replaced by 0.
        If the file does not exist, a 500-row synthetic frame with the same
        column names is returned instead (as built, without the dtype casts
        or the fill step).

    Raises:
        Exception: Any error from the default-engine read other than
            ``FileNotFoundError`` propagates to the caller.
    """
    print(f"üîÑ ƒêang ƒë·ªçc file '{file_path}'...")
    column_types = {
        'manufacturer': 'string', 'model': 'string',
        'year': 'int32', 'mileage': 'float32', 'price': 'float32',
        'mpg': 'string', 'fuel_type': 'string', 'engine': 'string',
        'accidents_or_damage': 'float32', 'one_owner': 'float32',
        'driver_rating': 'float32'
    }
    cols = list(column_types.keys())

    try:
        df = pd.read_csv(file_path, usecols=cols, dtype=column_types, engine='pyarrow')
    except Exception:
        # `except Exception` rather than a bare `except`: KeyboardInterrupt and
        # SystemExit must propagate instead of silently triggering a second read.
        try:
            df = pd.read_csv(file_path, usecols=cols, dtype=column_types)
        except FileNotFoundError:
            # Synthetic data used when the file is not available
            print("‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y file, ƒëang t·∫°o d·ªØ li·ªáu m·∫´u...")
            return pd.DataFrame({
                'manufacturer': ['Toyota', 'Ford', 'Honda', 'BMW', 'Audi'] * 100,
                'model': ['Camry', 'F-150', 'Civic', '320i', 'A4'] * 100,
                'year': [2015, 2018, 2012, 2010, 2021] * 100,
                'mileage': [40000, 30000, 120000, 150000, 10000] * 100,
                'price': [15000, 25000, 8000, 9000, 45000] * 100,
                'mpg': ['30-35', '18-24', '32-40', '25-30', '20-25'] * 100,
                'fuel_type': ['Hybrid', 'Gasoline', 'Gasoline', 'Diesel', 'Electric'] * 100,
                'engine': ['2.5L', '5.0L V8', '1.8L', '2.0L Turbo', 'Electric'] * 100,
                'accidents_or_damage': [0, 0, 1, 1, 0] * 100,
                'one_owner': [1, 1, 0, 0, 1] * 100,
                'driver_rating': [4.5, 4.8, 3.5, 3.8, 5.0] * 100
            })

    # Missing values (in every column, including the string ones) become 0.
    df = df.fillna(0)
    print(f"‚úÖ ƒê√£ t·∫£i xong {len(df):,} d√≤ng d·ªØ li·ªáu.")
    return df

# Load the listings from 'cars.csv'; load_data_optimized returns built-in sample
# data instead when that file cannot be found.
df_raw = load_data_optimized('cars.csv')

üîÑ ƒêang ƒë·ªçc file 'cars.csv'...
‚úÖ ƒê√£ t·∫£i xong 762,091 d√≤ng d·ªØ li·ªáu.


In [11]:
# Progress banner for the risk-model training cell.
print("ü§ñ ƒêang hu·∫•n luy·ªán AI ƒë√°nh gi√° r·ªßi ro...")

# 1. Build synthetic (rule-based) risk labels to train the model on
def create_risk_label(row):
    """Derive a rule-based risk label ('Low', 'Medium' or 'High') for one car.

    Penalty points are added up as follows:
      * +3 when ``accidents_or_damage`` equals 1
      * +1 when ``year`` is earlier than 2015
      * +2 when ``mileage`` exceeds 100000
      * +1 when ``one_owner`` equals 0

    A total of 0-1 maps to 'Low', 2-3 to 'Medium', and 4 or more to 'High'.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the four keys above.

    Returns:
        str: The risk label.
    """
    rules = (
        (row['accidents_or_damage'] == 1, 3),
        (row['year'] < 2015, 1),
        (row['mileage'] > 100000, 2),
        (row['one_owner'] == 0, 1),
    )
    total = sum(points for triggered, points in rules if triggered)

    if total > 3:
        return 'High'
    if total > 1:
        return 'Medium'
    return 'Low'

# Draw the training sample (at most 50,000 rows). A fixed random_state makes the
# sample, and therefore the fitted model, reproducible across kernel restarts;
# the classifier below was already seeded, but the sampling was not.
df_train = df_raw.sample(min(50000, len(df_raw)), random_state=42).copy()
df_train['Risk_Level'] = df_train.apply(create_risk_label, axis=1)

# 2. Train the Random Forest on the rule-derived labels
features = ['year', 'mileage', 'accidents_or_damage', 'one_owner', 'driver_rating']
X = df_train[features]
y = df_train['Risk_Level']

clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X, y)

print(f"‚úÖ AI ƒë√£ h·ªçc xong!")

ü§ñ ƒêang hu·∫•n luy·ªán AI ƒë√°nh gi√° r·ªßi ro...
‚úÖ AI ƒë√£ h·ªçc xong!


In [12]:
# =======================================================
# ENTER THE RELATIVE IMPORTANCE VALUES HERE
# =======================================================
# Each A_vs_B constant is one pairwise-comparison entry consumed by
# get_ahp_weights(): the value is stored at (A, B) and its reciprocal at (B, A).
# A value above 1 therefore favours A, a fraction favours B, and 1 means equal.
# Values must be non-zero because the reciprocal is computed as 1 / value.
PRICE_vs_YEAR      = 3      # Price vs Year
PRICE_vs_MILEAGE   = 2      # Price vs odometer reading
PRICE_vs_MPG       = 5
PRICE_vs_ACCIDENTS = 1
PRICE_vs_BRAND     = 2
PRICE_vs_FUEL      = 3
PRICE_vs_ENGINE    = 5

YEAR_vs_MILEAGE    = 1/2
YEAR_vs_MPG        = 2
YEAR_vs_ACCIDENTS  = 1/3
YEAR_vs_BRAND      = 1
YEAR_vs_FUEL       = 2
YEAR_vs_ENGINE     = 3

MILEAGE_vs_MPG       = 3
MILEAGE_vs_ACCIDENTS = 1/2
MILEAGE_vs_BRAND     = 2
MILEAGE_vs_FUEL      = 2
MILEAGE_vs_ENGINE    = 3

MPG_vs_ACCIDENTS     = 1/5
MPG_vs_BRAND         = 1/2
MPG_vs_FUEL          = 1
MPG_vs_ENGINE        = 2

ACCIDENTS_vs_BRAND   = 3
ACCIDENTS_vs_FUEL    = 3
ACCIDENTS_vs_ENGINE  = 5

BRAND_vs_FUEL        = 2
BRAND_vs_ENGINE      = 3

FUEL_vs_ENGINE       = 2

# Compute the AHP criterion weights
def get_ahp_weights():
    """Turn the module-level pairwise comparison constants into eight criterion weights.

    An 8x8 reciprocal matrix is built (ones on the diagonal, each constant in
    the upper triangle and its reciprocal mirrored below). Every column is then
    divided by its sum and the rows are averaged, which yields one weight per
    criterion; the weights add up to 1.

    Returns:
        numpy.ndarray: Weights in the order
        [Price, Year, Mileage, MPG, Accidents, Brand, Fuel, Engine].
    """
    # Row k holds the comparisons of criterion k against criteria k+1 .. 7.
    # Criterion order: 0 Price, 1 Year, 2 Mileage, 3 MPG, 4 Accidents, 5 Brand, 6 Fuel, 7 Engine
    upper_triangle = [
        [PRICE_vs_YEAR, PRICE_vs_MILEAGE, PRICE_vs_MPG, PRICE_vs_ACCIDENTS,
         PRICE_vs_BRAND, PRICE_vs_FUEL, PRICE_vs_ENGINE],
        [YEAR_vs_MILEAGE, YEAR_vs_MPG, YEAR_vs_ACCIDENTS,
         YEAR_vs_BRAND, YEAR_vs_FUEL, YEAR_vs_ENGINE],
        [MILEAGE_vs_MPG, MILEAGE_vs_ACCIDENTS, MILEAGE_vs_BRAND,
         MILEAGE_vs_FUEL, MILEAGE_vs_ENGINE],
        [MPG_vs_ACCIDENTS, MPG_vs_BRAND, MPG_vs_FUEL, MPG_vs_ENGINE],
        [ACCIDENTS_vs_BRAND, ACCIDENTS_vs_FUEL, ACCIDENTS_vs_ENGINE],
        [BRAND_vs_FUEL, BRAND_vs_ENGINE],
        [FUEL_vs_ENGINE],
    ]

    size = 8
    pairwise = np.ones((size, size))
    for first, comparisons in enumerate(upper_triangle):
        for second, ratio in enumerate(comparisons, start=first + 1):
            pairwise[first, second] = ratio
            pairwise[second, first] = 1 / ratio

    column_totals = pairwise.sum(axis=0)
    normalised = pairwise / column_totals
    return normalised.mean(axis=1)

# Compute the criterion weight vector once; it is passed to process_data later.
weights = get_ahp_weights()
print("‚úÖ ƒê√£ t√≠nh xong tr·ªçng s·ªë AHP!")

‚úÖ ƒê√£ t√≠nh xong tr·ªçng s·ªë AHP!


In [13]:
def process_data(df, weights, ai_model):
    """Rank cars by a weighted AHP score and annotate the 20 best with a risk prediction.

    Steps:
      1. Derive helper columns ``mpg_num``, ``fuel_score``, ``brand_score`` and
         ``engine_score`` on a copy of ``df``.
      2. Min-max normalise the eight criteria to [0, 1]; price, mileage and
         accidents are inverted so that lower raw values score higher.
      3. ``AHP_Score`` is the weighted sum of the normalised criteria.
      4. Keep the 20 highest-scoring rows.
      5. Add ``Predicted_Risk`` (from ``ai_model.predict``) and a textual
         ``Reason_Explanation`` to those rows.

    Args:
        df: DataFrame with the columns 'manufacturer', 'year', 'mileage', 'price',
            'mpg', 'fuel_type', 'engine', 'accidents_or_damage', 'one_owner' and
            'driver_rating'. It is not modified.
        weights: Indexable of eight weights in the order
            [price, year, mileage, mpg, accidents, brand, fuel, engine].
        ai_model: Object whose ``predict(frame)`` returns one label per row for the
            columns ['year', 'mileage', 'accidents_or_damage', 'one_owner', 'driver_rating'].

    Returns:
        pandas.DataFrame: Up to 20 rows sorted by ``AHP_Score`` (descending),
        including the helper columns and the two annotation columns.
    """
    df = df.copy()

    # --- 1. PREPARE THE DATA FOR THE AHP SCORE ---
    def parse_mpg(val):
        """Convert an MPG entry such as '30-35' to the mean of its first two parts; 0 if unparseable."""
        try:
            text = str(val)
            if '-' in text:
                parts = text.split('-')
                return (float(parts[0]) + float(parts[1])) / 2
            return float(text)
        except (ValueError, TypeError):
            # Only conversion failures are expected here; anything else should surface.
            return 0
    df['mpg_num'] = df['mpg'].apply(parse_mpg)

    # Fuel type, brand and engine scores
    def score_fuel(val):
        """Score the fuel type: electric/hybrid 1.0, gasoline 0.5, anything else 0.3."""
        val = str(val).lower()
        if 'electric' in val or 'hybrid' in val: return 1.0
        if 'gasoline' in val: return 0.5
        return 0.3
    df['fuel_score'] = df['fuel_type'].apply(score_fuel)

    # Brand score = share of the listings that belong to the manufacturer.
    brand_counts = df['manufacturer'].value_counts(normalize=True)
    df['brand_score'] = df['manufacturer'].map(brand_counts)

    # NOTE(review): label codes follow the sorted order of the engine strings, so
    # this "score" reflects lexical position rather than engine quality; confirm
    # that this is the intended criterion.
    le = LabelEncoder()
    df['engine_score'] = le.fit_transform(df['engine'].astype(str))
    engine_low, engine_high = df['engine_score'].min(), df['engine_score'].max()
    if engine_high != engine_low:
        df['engine_score'] = (df['engine_score'] - engine_low) / (engine_high - engine_low)
    else:
        df['engine_score'] = 0

    # --- 2. COMPUTE THE AHP SCORE ---
    def min_max(values, lower_is_better=False):
        """Scale a column to [0, 1]; a constant column becomes an all-zero Series."""
        values = values.astype(float)
        low, high = values.min(), values.max()
        if high == low:
            # Return a Series (not a scalar) so the column keeps the full row index.
            return pd.Series(0.0, index=values.index)
        if lower_is_better:
            return (high - values) / (high - low)
        return (values - low) / (high - low)

    # Start from df's index: assigning a scalar to an index-less frame would create
    # a zero-length column that later turns into NaN and poisons every score.
    df_norm = pd.DataFrame(index=df.index)
    # Criteria where lower is better
    for col in ['price', 'mileage', 'accidents_or_damage']:
        df_norm[col] = min_max(df[col], lower_is_better=True)
    # Criteria where higher is better
    for col_raw, col_norm in [('year', 'year'), ('mpg_num', 'mpg'), ('brand_score', 'brand'),
                              ('fuel_score', 'fuel'), ('engine_score', 'engine')]:
        df_norm[col_norm] = min_max(df[col_raw])

    # Position in this list matches the position of the criterion in `weights`.
    criteria_order = ['price', 'year', 'mileage', 'mpg',
                      'accidents_or_damage', 'brand', 'fuel', 'engine']
    df['AHP_Score'] = sum(df_norm[name] * weights[pos] for pos, name in enumerate(criteria_order))

    # --- 3. KEEP THE TOP 20 ---
    top_20 = df.sort_values(by='AHP_Score', ascending=False).head(20).copy()

    # --- 4. RUN THE RISK MODEL ON THE TOP 20 ---
    ai_features = ['year', 'mileage', 'accidents_or_damage', 'one_owner', 'driver_rating']
    top_20['Predicted_Risk'] = ai_model.predict(top_20[ai_features])

    # Human-readable explanation
    def explain_why(row):
        """List the risk flags raised by a row, or the 'safe' marker when there are none."""
        reasons = []
        if row['accidents_or_damage'] == 1: reasons.append("T·ª´ng tai n·∫°n")
        if row['year'] < 2015: reasons.append(f"ƒê·ªùi s√¢u ({row['year']})")
        if row['mileage'] > 100000: reasons.append(f"Odo cao")
        if row['one_owner'] == 0: reasons.append("Nhi·ªÅu ch·ªß")
        if row['driver_rating'] < 4.0: reasons.append(f"Rating th·∫•p")
        if not reasons: return "‚úÖ An to√†n"
        return "‚ö†Ô∏è " + ", ".join(reasons)

    top_20['Reason_Explanation'] = top_20.apply(explain_why, axis=1)

    return top_20

# Rank the full dataset and annotate the 20 best cars using the trained classifier.
final_top_20 = process_data(df_raw, weights, clf)
print("‚úÖ ƒê√£ x·ª≠ l√Ω xong d·ªØ li·ªáu!")

‚úÖ ƒê√£ x·ª≠ l√Ω xong d·ªØ li·ªáu!


In [14]:
# Display formats: price as whole dollars with thousands separators, mileage with
# thousands separators, AHP score with four decimals.
format_dict = {'price': '${:,.0f}', 'mileage': '{:,.0f}', 'AHP_Score': '{:.4f}'}

def highlight_risk(val):
    """Return the CSS declaration used to colour one risk label.

    'High' is bold red, 'Low' is bold green, and every other value is orange.

    Args:
        val: Cell value to style.

    Returns:
        str: CSS text for the cell.
    """
    emphasised = (
        ('High', 'color: red; font-weight: bold'),
        ('Low', 'color: green; font-weight: bold'),
    )
    for level, css in emphasised:
        if val == level:
            return css
    return 'color: orange'

# ==============================================================================
# TABLE 1: VEHICLE INFORMATION AND AHP SCORE ONLY (ranked by user preference)
# ==============================================================================
print("\n" + "="*100)
print("üìå B·∫¢NG 1: TOP 20 XE T·ªêT NH·∫§T THEO TI√äU CH√ç AHP (CH∆ØA ƒê√ÅNH GI√Å R·ª¶I RO)")
print("="*100)

cols_ahp = [
    'manufacturer', 'model', 'price', 'year', 'mileage',
    'mpg', 'fuel_type', 'engine', 'AHP_Score'
]

# Rich display when possible, plain text otherwise.
try:
    display(final_top_20[cols_ahp].style.format(format_dict))
except Exception:
    # `except Exception` instead of a bare `except`, so Ctrl-C is not swallowed.
    # Expected failures include `display` being undefined outside IPython or the
    # Styler being unavailable.
    print(final_top_20[cols_ahp].to_string(index=False))

# Export table 1 to CSV
final_top_20[cols_ahp].to_csv('bang_1_ahp_ranking.csv', index=False)





üìå B·∫¢NG 1: TOP 20 XE T·ªêT NH·∫§T THEO TI√äU CH√ç AHP (CH∆ØA ƒê√ÅNH GI√Å R·ª¶I RO)
manufacturer             model   price  year  mileage   mpg fuel_type                                engine  AHP_Score
        Ford          Maverick 42971.0  2023    507.0 40-33    Hybrid            Gas/Electric I-4 2.5 L/152   0.954786
        Ford   Maverick LARIAT 44971.0  2023    768.0 40-33    Hybrid            Gas/Electric I-4 2.5 L/152   0.954751
        Ford      Maverick XLT 30788.0  2022  10493.0 42-33    Hybrid            Gas/Electric I-4 2.5 L/152   0.953007
        Ford      Maverick XLT 35686.0  2022  11050.0 42-33    Hybrid            Gas/Electric I-4 2.5 L/152   0.952932
        Ford  Fusion Hybrid SE 21995.0  2018  45070.0 43-41    Hybrid            Gas/Electric I-4 2.0 L/122   0.946797
        Ford  Fusion Hybrid SE 21417.0  2018  48106.0 43-41    Hybrid            Gas/Electric I-4 2.0 L/122   0.946397
        Ford  F-150 King Ranch 63998.0  2021  21226.0 16-22    Hybrid Twin Turb

In [15]:
# ==============================================================================
# TABLE 2: ADDS THE AI RISK ASSESSMENT (decision support)
# ==============================================================================
print("\n" + "="*100)
print("ü§ñ B·∫¢NG 2: K·∫æT QU·∫¢ PH√ÇN T√çCH R·ª¶I RO T·ª™ AI (CHO TOP 20 XE TR√äN)")
print("="*100)

cols_risk = [
    'manufacturer', 'model', 'price', 'year',
    'AHP_Score',        # existing AHP score
    'Predicted_Risk',   # added by the classifier
    'Reason_Explanation'# reasons behind the flags
]

# Rich display when possible, plain text otherwise.
try:
    # Only pass formatters for columns that are actually shown ('mileage' is not).
    risk_formats = {col: fmt for col, fmt in format_dict.items() if col in cols_risk}
    styled = final_top_20[cols_risk].style.format(risk_formats)
    # Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name is
    # deprecated; use whichever one this pandas version provides.
    colour_cells = getattr(styled, 'map', None) or styled.applymap
    display(colour_cells(highlight_risk, subset=['Predicted_Risk']))
except Exception:
    # `except Exception` instead of a bare `except`, so Ctrl-C is not swallowed.
    print(final_top_20[cols_risk].to_string(index=False))

# Export table 2 to CSV
final_top_20[cols_risk].to_csv('bang_2_ai_risk.csv', index=False)
print("\n‚úÖ ƒê√£ xu·∫•t 2 file: 'bang_1_ahp_ranking.csv' v√† 'bang_2_ai_risk.csv' th√†nh c√¥ng!")


ü§ñ B·∫¢NG 2: K·∫æT QU·∫¢ PH√ÇN T√çCH R·ª¶I RO T·ª™ AI (CHO TOP 20 XE TR√äN)
manufacturer             model   price  year  AHP_Score Predicted_Risk Reason_Explanation
        Ford          Maverick 42971.0  2023   0.954786            Low          ‚úÖ An to√†n
        Ford   Maverick LARIAT 44971.0  2023   0.954751            Low          ‚úÖ An to√†n
        Ford      Maverick XLT 30788.0  2022   0.953007            Low          ‚úÖ An to√†n
        Ford      Maverick XLT 35686.0  2022   0.952932            Low          ‚úÖ An to√†n
        Ford  Fusion Hybrid SE 21995.0  2018   0.946797            Low       ‚ö†Ô∏è Nhi·ªÅu ch·ªß
        Ford  Fusion Hybrid SE 21417.0  2018   0.946397            Low       ‚ö†Ô∏è Nhi·ªÅu ch·ªß
        Ford  F-150 King Ranch 63998.0  2021   0.945999            Low       ‚ö†Ô∏è Nhi·ªÅu ch·ªß
        Ford      F-150 Lariat 53726.0  2021   0.945939            Low          ‚úÖ An to√†n
        Ford  C-Max Energi SEL 18598.0  2016   0.944893            Low  