In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

!pip install pandas numpy scikit-learn xgboost lightgbm


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.utils.class_weight import compute_class_weight


train = pd.read_csv('/content/drive/My Drive/extra/train.csv')
test = pd.read_csv('/content/drive/My Drive/extra/test.csv')
sample_submission = pd.read_csv('/content/drive/My Drive/extra/sample_submission.csv')


print("训练集信息：")
print(train.info())
print("\n测试集信息：")
print(test.info())


训练集信息：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370703 entries, 0 to 370702
Data columns (total 24 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          370703 non-null  int64  
 1   trans_num   370703 non-null  object 
 2   trans_date  370703 non-null  object 
 3   trans_time  370703 non-null  object 
 4   unix_time   370703 non-null  int64  
 5   category    370703 non-null  object 
 6   amt         370703 non-null  float64
 7   cc_num      370703 non-null  int64  
 8   first       370703 non-null  object 
 9   last        370703 non-null  object 
 10  gender      370703 non-null  object 
 11  street      370703 non-null  object 
 12  city        370703 non-null  object 
 13  state       370703 non-null  object 
 14  zip         370703 non-null  int64  
 15  lat         370703 non-null  float64
 16  long        370703 non-null  float64
 17  city_pop    370703 non-null  int64  
 18  job         370703 non-null  object 


In [5]:
print("训练集列名称：", train.columns.tolist())
print("测试集列名称：", test.columns.tolist())

time_column_name = 'trans_date'

train[time_column_name] = pd.to_datetime(train[time_column_name])
test[time_column_name] = pd.to_datetime(test[time_column_name])

train['year'] = train[time_column_name].dt.year
train['month'] = train[time_column_name].dt.month
train['day'] = train[time_column_name].dt.day
train['hour'] = train['trans_time'].str.split(':').str[0].astype(int)

test['year'] = test[time_column_name].dt.year
test['month'] = test[time_column_name].dt.month
test['day'] = test[time_column_name].dt.day
test['hour'] = test['trans_time'].str.split(':').str[0].astype(int)


def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2)**2
    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

train['distance'] = haversine(train['lat'], train['long'], train['merch_lat'], train['merch_long'])
test['distance'] = haversine(test['lat'], test['long'], test['merch_lat'], test['merch_long'])

categorical_features = ['category', 'state', 'job', 'gender', 'merchant']
for col in categorical_features:
    train[col] = train[col].astype('category').cat.codes
    test[col] = test[col].astype('category').cat.codes

drop_cols = ['trans_num', 'first', 'last', 'street', 'city', 'dob', time_column_name, 'trans_time']
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

print("预处理后的训练数据：")
print(train.info())


训练集列名称： ['id', 'trans_num', 'trans_date', 'trans_time', 'unix_time', 'category', 'amt', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merchant', 'merch_lat', 'merch_long', 'is_fraud']
测试集列名称： ['id', 'trans_num', 'trans_date', 'trans_time', 'unix_time', 'category', 'amt', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merchant', 'merch_lat', 'merch_long']
预处理后的训练数据：
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370703 entries, 0 to 370702
Data columns (total 21 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          370703 non-null  int64  
 1   unix_time   370703 non-null  int64  
 2   category    370703 non-null  int8   
 3   amt         370703 non-null  float64
 4   cc_num      370703 non-null  int64  
 5   gender      370703 non-null  int8   
 6   state       370703 non-null  int8   
 7   zip  

In [7]:
from sklearn.model_selection import GridSearchCV


param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [5, 7, 9],
    'num_leaves': [15, 31, 63],
    'n_estimators': [100, 300, 500]
}


grid_search = GridSearchCV(
    estimator=LGBMClassifier(class_weight='balanced', random_state=42),
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=2
)

X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=['is_fraud']), train['is_fraud'], test_size=0.2, random_state=42)

grid_search.fit(X_train, y_train)

print("最佳参数：", grid_search.best_params_)

best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)


y_val_pred = best_model.predict(X_val)
print("优化后的 LightGBM 验证集 F1-Score:")
print(f1_score(y_val, y_val_pred))
print("\n分类报告：")
print(classification_report(y_val, y_val_pred))

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
[LightGBM] [Info] Start training from score 0.000000
[CV] END learning_rate=0.05, max_depth=9, n_estimators=500, num_leaves=31; total time=  17.1s
[LightGBM] [Info] Number of positive: 22500, number of negative: 175208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3428
[LightGBM] [Info] Number of data points in the train set: 197708, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[CV] END learning_rate=0.05, max_depth=9, n_estimators=500, num_leaves=31; total time=  16.3s
[LightGBM] [Info] Number of positive: 22500, number of negative: 175208
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.

In [8]:
test_pred = best_model.predict(test)

submission = sample_submission.copy()
submission['is_fraud'] = test_pred
submission.to_csv('submission.csv', index=False)

print("提交文件已保存为 submission.csv")

提交文件已保存为 submission.csv
