In [1]:
import sys  # System-specific parameters and functions
import numpy as np  # Fundamental package for scientific computing with Python
import pandas as pd  # Powerful data structures for data manipulation and analysis
from datetime import datetime  # Basic date and time types
import warnings  # Warning control
warnings.filterwarnings('ignore')  # Ignore warnings

In [2]:
df = pd.read_csv(('C:/Users/nguye/Downloads/cs114/data-tbtl/annonimized.csv'))
df = df[(df['is_final'] != 0) | (df['pre_score'] != 10000)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 288863 entries, 0 to 295197
Data columns (total 11 columns):
 #   Column                           Non-Null Count   Dtype 
---  ------                           --------------   ----- 
 0   concat('it001',`assignment_id`)  288863 non-null  object
 1   concat('it001',`problem_id`)     288863 non-null  object
 2   concat('it001', username)        288863 non-null  object
 3   is_final                         288863 non-null  int64 
 4   status                           288863 non-null  object
 5   pre_score                        288863 non-null  int64 
 6   coefficient                      288863 non-null  int64 
 7   concat('it001',`language_id`)    288863 non-null  object
 8   created_at                       288863 non-null  object
 9   updated_at                       288863 non-null  object
 10  judgement                        288863 non-null  object
dtypes: int64(3), object(8)
memory usage: 26.4+ MB


In [3]:
df = df.rename(columns={"concat('it001',`assignment_id`)": 'assignment_id'})
df = df.rename(columns={"concat('it001',`problem_id`)":'problem_id'})
df = df.rename(columns={"concat('it001', username)":'username'})

In [4]:
df = df.drop(["concat('it001',`language_id`)", 'updated_at'], axis=1)

In [5]:
def calculate_frequency_vector(student_df):
    a = pd.to_datetime(student_df['created_at'], format='%m-%d %H:%M:%S', errors='coerce').dt.hour
    a = a.dropna()  # Drop rows with NaT (errors in parsing)

    hour_counts = a.value_counts().sort_index()
    frequency_vector = np.zeros(24)
    for hour, count in hour_counts.items():
        hour = int(hour)
        frequency_vector[hour] = count
    return frequency_vector

# Dictionary to store frequency vectors for each student
frequency_vectors = {}

# Iterate over each unique student and calculate their frequency vector
for username in df["username"].unique():
    student_df = df[df["username"] == username]
    frequency_vectors[username] = calculate_frequency_vector(student_df)

In [6]:
frequency_df = pd.DataFrame.from_dict(frequency_vectors, orient='index', columns=[f'hour_{i}' for i in range(24)]).reset_index()
frequency_df.rename(columns={'index': "username"}, inplace=True)

In [7]:
df = df.merge(frequency_df, on="username")

In [8]:
fixed_year = 2024  # Chọn một năm bất kỳ
df['created_at'] = [f"{fixed_year}-{date}" for date in df['created_at']]

In [9]:
df['created_at'] = pd.to_datetime(df['created_at'])
def adjust_year(date):
    if date.month >= 9:
        return date.replace(year=date.year - 1)
    return date

# Áp dụng hàm này vào cột 'created_at'
df['created_at'] = df['created_at'].apply(adjust_year)

In [10]:
df['created_at'] = pd.to_datetime(df['created_at'])

# Số lượng assignment và problem

In [11]:
num_assigmnet1 = df.groupby(["username"])['assignment_id'].nunique().reset_index().rename(columns={'assignment_id': 'num_assigmnet1'})
num_problem = df.groupby(["username"])["problem_id"].nunique().reset_index().rename(columns={'problem_id': 'num_problem'})
submit_count = df.groupby(["username"])["problem_id"].count().reset_index().rename(columns={'problem_id': 'submit_count'})

# Số lần nộp SCORE và Khác

In [12]:
count_non_score = df[df['status'] != 'SCORE'].groupby("username").size().reset_index().rename(columns={0: 'count_NON_SCORE'})
count_score = df[df['status'] == 'SCORE'].groupby("username").size().reset_index().rename(columns={0: 'count_SCORE'})

# Số lần nộp is_final 

In [13]:
count_is_final = df[df['is_final'] == 1].groupby("username").size().reset_index().rename(columns={0: 'count_is_final'})

# Điểm số trung bình

In [14]:
mean_pre_score = df.groupby(["username"])["pre_score"].mean().reset_index().rename(columns={'pre_score': 'mean_pre_score'})

# Tỉ số số lần nộp và assigment

In [15]:
count_of_assignments = df.groupby(["username"])["problem_id"].count() / df.groupby(["username"])["assignment_id"].nunique()

In [16]:
count_of_assignments = count_of_assignments.reset_index().rename(columns={0: 'count_of_assignments'})

In [17]:
count_of_assignments

Unnamed: 0,username,count_of_assignments
0,00b6dd4fc7eb817e03708c532016ef30ce564a61,19.857143
1,00bef8afee8f3c595d535c9c03c490cac1a4f021,27.333333
2,01122b3ef7e59b84189e65985305f575d6bdf83c,27.857143
3,0134f9f410c65ad0e8c2254a7e9288670e02a183,23.750000
4,013de369c439ab0ead8aa7da64423aa395a8be39,13.250000
...,...,...
1484,feb8a2859a011c59efd22ed419cb69288fe03627,7.555556
1485,fef4a3263ed9a8ab14d457694bb8fd86ccd98312,42.500000
1486,ff12d6e2ab80696ed8e22fbe5497e96c68d29076,72.333333
1487,ff3fa2ec64294f37ae968159f810ebeda7966c51,13.428571


# Tỉ số số lần nộp và problem

In [18]:
count_of_problems = df.groupby(["username"])["problem_id"].count() / df.groupby(["username"])["problem_id"].nunique()
count_of_problems = count_of_problems.reset_index().rename(columns={0: 'count_of_problems'})

# Tỉ lệ làm bài sai

In [19]:
tgia = df.groupby(["username", "problem_id"])["pre_score"].max()
tgia = tgia.groupby(["username"]).apply(lambda x: (x != 10000).sum()) / tgia.groupby(["username"]).count()

In [20]:
tgia = tgia.reset_index().rename(columns={"pre_score": 'ratio_not_10000'})

# Trung bình submit problem

In [21]:
mean_submit_problem = df.groupby(["username", "problem_id"])["problem_id"].count()
mean_submit_problem = mean_submit_problem.groupby("username").mean().reset_index().rename(columns={"problem_id": 'mean_submit_problem'})

# Trung bình submit assignment

In [22]:
mean_submit_assignment = df.groupby(["username", "assignment_id"])["assignment_id"].count()
mean_submit_assignment = mean_submit_problem.groupby("username").mean().reset_index().rename(columns={"assignment_id": 'mean_submit_assignment'})

# Trung bình time problem

In [23]:
mean_time_problem = df.groupby(["username", "problem_id"])['created_at'].agg(lambda x: (x.max() - x.min()).total_seconds() / 3600)
mean_time_problem = (mean_time_problem.groupby(["username"]).mean()/10).reset_index().rename(columns={"created_at": 'mean_submit_problem'})

# Trung bình time assignment

In [24]:
mean_time_assignment = df.groupby(["username", "assignment_id"])['created_at'].agg(lambda x: (x.max() - x.min()).total_seconds() / 3600)
mean_time_assignment = (mean_time_assignment.groupby(["username"]).mean()/10).reset_index().rename(columns={"created_at": 'mean_submit_assignment'})

In [25]:
df = df.merge(num_assigmnet1, on="username")
df = df.merge(num_problem, on="username")
df = df.merge(submit_count, on="username")
df = df.merge(count_non_score, on="username")
df = df.merge(count_score, on="username")
df = df.merge(count_is_final, on="username")
df = df.merge(mean_pre_score, on="username")
df = df.merge(count_of_assignments, on="username")
df = df.merge(tgia, on="username")
df = df.merge(count_of_problems, on="username")
df = df.merge(mean_submit_problem, on="username")
df = df.merge(mean_time_problem, on="username")

In [26]:
df = df.drop(['problem_id_x'],axis = 1)

In [27]:
df = df.drop(['problem_id_y'],axis = 1)

In [28]:
df = df.drop(['assignment_id'],axis = 1)

In [29]:
df.columns

Index(['username', 'is_final', 'status', 'pre_score', 'coefficient',
       'created_at', 'judgement', 'hour_0', 'hour_1', 'hour_2', 'hour_3',
       'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10',
       'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16',
       'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22',
       'hour_23', 'num_assigmnet1', 'num_problem', 'submit_count',
       'count_NON_SCORE', 'count_SCORE', 'count_is_final', 'mean_pre_score',
       'count_of_assignments', 'ratio_not_10000', 'mean_submit_problem_x',
       'mean_submit_problem_y'],
      dtype='object')

In [30]:
!pip install lightgbm catboost



In [31]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error as MSE
import lightgbm as lgb
import optuna
from sklearn.model_selection import cross_validate
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
from sklearn.ensemble import VotingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer

In [32]:
train_term1 = df.drop(['is_final','status','pre_score','coefficient','created_at','judgement'], axis = 1)

In [33]:
test_term1 = train_term1

In [34]:
test_term1

Unnamed: 0,username,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,...,num_problem,submit_count,count_NON_SCORE,count_SCORE,count_is_final,mean_pre_score,count_of_assignments,ratio_not_10000,mean_submit_problem_x,mean_submit_problem_y
0,ed9eaeb6a707f50154024b24d7efcb874a9795dd,6.0,6.0,1.0,1.0,6.0,8.0,5.0,11.0,17.0,...,58,136,34,102,58,5202.000000,27.200000,0.034483,2.344828,0.177979
1,ed9eaeb6a707f50154024b24d7efcb874a9795dd,6.0,6.0,1.0,1.0,6.0,8.0,5.0,11.0,17.0,...,58,136,34,102,58,5202.000000,27.200000,0.034483,2.344828,0.177979
2,ed9eaeb6a707f50154024b24d7efcb874a9795dd,6.0,6.0,1.0,1.0,6.0,8.0,5.0,11.0,17.0,...,58,136,34,102,58,5202.000000,27.200000,0.034483,2.344828,0.177979
3,ed9eaeb6a707f50154024b24d7efcb874a9795dd,6.0,6.0,1.0,1.0,6.0,8.0,5.0,11.0,17.0,...,58,136,34,102,58,5202.000000,27.200000,0.034483,2.344828,0.177979
4,ed9eaeb6a707f50154024b24d7efcb874a9795dd,6.0,6.0,1.0,1.0,6.0,8.0,5.0,11.0,17.0,...,58,136,34,102,58,5202.000000,27.200000,0.034483,2.344828,0.177979
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
288284,232cce96362898f08e9150ba244adaf2d6583ab2,0.0,0.0,10.0,7.0,2.0,8.0,4.0,6.0,20.0,...,98,216,45,171,98,5680.976852,19.636364,0.020408,2.204082,1.677821
288285,232cce96362898f08e9150ba244adaf2d6583ab2,0.0,0.0,10.0,7.0,2.0,8.0,4.0,6.0,20.0,...,98,216,45,171,98,5680.976852,19.636364,0.020408,2.204082,1.677821
288286,232cce96362898f08e9150ba244adaf2d6583ab2,0.0,0.0,10.0,7.0,2.0,8.0,4.0,6.0,20.0,...,98,216,45,171,98,5680.976852,19.636364,0.020408,2.204082,1.677821
288287,232cce96362898f08e9150ba244adaf2d6583ab2,0.0,0.0,10.0,7.0,2.0,8.0,4.0,6.0,20.0,...,98,216,45,171,98,5680.976852,19.636364,0.020408,2.204082,1.677821


In [35]:
test_term1.drop_duplicates(subset='username', keep='first', inplace=True)

In [36]:
qt_train = pd.read_csv('C:/Users/nguye/Downloads/cs114/public_it001/qt-public.csv')

In [37]:
qt_train.isnull().sum()
qt_train = qt_train.rename(columns={"hash":'username'})
qt_train.dropna(inplace=True)
train_term1 = train_term1.merge(qt_train, on='username', how='inner')
train_term1.drop_duplicates(subset='username', keep='first', inplace=True)
train_term1['diemqt'] = train_term1['diemqt'].str.replace('\xa0', ' ', regex=True)
train_term1['diemqt'] = train_term1['diemqt'].replace(' ', np.nan).astype(float)
train_term1.dropna(inplace=True)

In [38]:
train_term1.columns

Index(['username', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'num_assigmnet1',
       'num_problem', 'submit_count', 'count_NON_SCORE', 'count_SCORE',
       'count_is_final', 'mean_pre_score', 'count_of_assignments',
       'ratio_not_10000', 'mean_submit_problem_x', 'mean_submit_problem_y',
       'diemqt'],
      dtype='object')

In [39]:
X_train= train_term1.drop(columns=["diemqt", "username"])
y = train_term1['diemqt']
y = y.astype(float)

In [40]:
y = np.asarray(y)
X_pca = np.asarray(X_train)

In [41]:
def objective_lgb(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "n_estimators": 1000,
        "verbosity": -1,
        "bagging_freq": 1,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        'device': 'gpu',
    }
    cv = KFold(n_splits=5, shuffle=True,random_state = 42)
    r2_list = []

    for train_index, test_index in cv.split(X_pca,y):
        X_train_fold, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model = lgb.LGBMRegressor(**params)

        model.fit(X_train_fold, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        r2_list.append(r2)

    return np.mean(r2_list)

In [42]:
def objective_cat(trial):
    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 3, 8),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'verbose': False,
        'loss_function':'RMSE',
        'task_type': 'GPU'
    }

    cv = KFold(n_splits=5, shuffle=True,random_state = 42)
    r2_list = []

    for train_index, test_index in cv.split(X_pca,y):
        X_train_fold, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model = CatBoostRegressor(**params)
        model.fit(X_train_fold, y_train, eval_set=(X_test, y_test), early_stopping_rounds=10)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        r2_list.append(r2)

    return np.mean(r2_list)

In [44]:
study_lgb = optuna.create_study(direction="minimize")
study_lgb.optimize(objective_lgb, n_trials=30)
print('Best parameters for LightGBM:', study_lgb.best_params)

[I 2025-01-18 19:58:45,634] A new study created in memory with name: no-name-79fb644d-cda7-4840-8954-8c3bad65ed53
[I 2025-01-18 19:58:52,626] Trial 0 finished with value: -0.03625634375914326 and parameters: {'learning_rate': 0.05776366181234697, 'num_leaves': 927, 'subsample': 0.5773614548089785, 'colsample_bytree': 0.3635131933752127, 'min_data_in_leaf': 98}. Best is trial 0 with value: -0.03625634375914326.
[I 2025-01-18 19:59:03,115] Trial 1 finished with value: -0.05988265084238618 and parameters: {'learning_rate': 0.06004333977044039, 'num_leaves': 293, 'subsample': 0.6500001314835424, 'colsample_bytree': 0.32043721165060185, 'min_data_in_leaf': 67}. Best is trial 1 with value: -0.05988265084238618.
[I 2025-01-18 20:05:52,971] Trial 2 finished with value: 0.10033986700710544 and parameters: {'learning_rate': 0.009791699485086462, 'num_leaves': 144, 'subsample': 0.9068572153199825, 'colsample_bytree': 0.8449066996561186, 'min_data_in_leaf': 2}. Best is trial 1 with value: -0.05988

Best parameters for LightGBM: {'learning_rate': 0.09900506184613107, 'num_leaves': 805, 'subsample': 0.7056668116969942, 'colsample_bytree': 0.7725518063734025, 'min_data_in_leaf': 92}


In [45]:
study_cat = optuna.create_study(direction='minimize')
study_cat.optimize(objective_cat, n_trials=30)
print('Best parameters for CatBoost:', study_cat.best_params)

[I 2025-01-18 20:19:16,298] A new study created in memory with name: no-name-2febe23d-f464-4265-9d47-b8ff63006b0d
[I 2025-01-18 20:19:28,710] Trial 0 finished with value: 0.14046252271930365 and parameters: {'learning_rate': 0.013378417181825749, 'depth': 8, 'l2_leaf_reg': 0.002030950700662506, 'iterations': 136}. Best is trial 0 with value: 0.14046252271930365.
[I 2025-01-18 20:19:59,497] Trial 1 finished with value: 0.13304677331762713 and parameters: {'learning_rate': 0.014575975393457697, 'depth': 6, 'l2_leaf_reg': 2.6719743600007413, 'iterations': 812}. Best is trial 1 with value: 0.13304677331762713.
[I 2025-01-18 20:20:13,944] Trial 2 finished with value: 0.12493356675008016 and parameters: {'learning_rate': 0.031491815545175, 'depth': 3, 'l2_leaf_reg': 0.008857410722098029, 'iterations': 969}. Best is trial 2 with value: 0.12493356675008016.
[I 2025-01-18 20:21:06,692] Trial 3 finished with value: 0.13799652329170634 and parameters: {'learning_rate': 0.014038404540551387, 'dept

Best parameters for CatBoost: {'learning_rate': 0.010244274649330545, 'depth': 4, 'l2_leaf_reg': 0.03099210257309991, 'iterations': 793}


In [46]:
lgb_best_params = study_lgb.best_params
cat_best_params = study_cat.best_params

kf = KFold(n_splits=5, shuffle=True, random_state=42)
voting_r2 = []

for train_index, test_index in kf.split(X_pca):
    X_train_fold, X_test = X_pca[train_index], X_pca[test_index]
    y_train, y_test = y[train_index], y[test_index]

    lgb_model = lgb.LGBMRegressor(**lgb_best_params)
    cat_model = CatBoostRegressor(**cat_best_params)

    voting_model = VotingRegressor(estimators=[
        ('lgb', lgb_model),
        ('cat', cat_model)
    ])
    cat_model.fit(X_train_fold, y_train)
    lgb_model.fit(X_train_fold, y_train)
    voting_model.fit(X_train_fold, y_train)
    y_pred = voting_model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    voting_r2.append(r2)

print(f'Voting Regressor R^2: {np.mean(voting_r2):.4f} ± {np.std(voting_r2):.4f}')


0:	learn: 1.7166086	total: 1.69ms	remaining: 1.34s
1:	learn: 1.7129084	total: 3.26ms	remaining: 1.29s
2:	learn: 1.7103398	total: 4.65ms	remaining: 1.22s
3:	learn: 1.7086660	total: 6.08ms	remaining: 1.2s
4:	learn: 1.7066073	total: 7.42ms	remaining: 1.17s
5:	learn: 1.7037721	total: 8.73ms	remaining: 1.14s
6:	learn: 1.7007839	total: 10.2ms	remaining: 1.15s
7:	learn: 1.6980479	total: 11.7ms	remaining: 1.14s
8:	learn: 1.6958355	total: 13.4ms	remaining: 1.17s
9:	learn: 1.6939457	total: 15.2ms	remaining: 1.19s
10:	learn: 1.6926726	total: 17.2ms	remaining: 1.22s
11:	learn: 1.6906589	total: 18.7ms	remaining: 1.22s
12:	learn: 1.6882200	total: 20.1ms	remaining: 1.2s
13:	learn: 1.6869361	total: 21.4ms	remaining: 1.19s
14:	learn: 1.6851233	total: 22.9ms	remaining: 1.19s
15:	learn: 1.6828316	total: 24.2ms	remaining: 1.17s
16:	learn: 1.6805968	total: 25.6ms	remaining: 1.17s
17:	learn: 1.6778106	total: 27ms	remaining: 1.16s
18:	learn: 1.6765317	total: 28.4ms	remaining: 1.16s
19:	learn: 1.6752339	total

In [47]:
import joblib

In [48]:
joblib.dump(voting_model, 'voting_regressor.joblib')
joblib.dump(lgb_model, 'LGBMRegressor.joblib')
joblib.dump(cat_model, 'CatBoostRegressor.joblib')

['CatBoostRegressor.joblib']

In [49]:
common_cols = train_term1.columns.intersection(test_term1.columns)

# Tạo DataFrame mới chỉ chứa các cột chung
train_term1_common = train_term1[common_cols]
df_test_common = test_term1[common_cols]

# Tìm các hàng khác nhau
different_rows = pd.concat([train_term1_common, df_test_common]).drop_duplicates(keep=False)

different_rows.reset_index(drop=True, inplace=True)
different_rows.head(5)

Unnamed: 0,username,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,...,num_problem,submit_count,count_NON_SCORE,count_SCORE,count_is_final,mean_pre_score,count_of_assignments,ratio_not_10000,mean_submit_problem_x,mean_submit_problem_y
0,ed9eaeb6a707f50154024b24d7efcb874a9795dd,6.0,6.0,1.0,1.0,6.0,8.0,5.0,11.0,17.0,...,58,136,34,102,58,5202.0,27.2,0.034483,2.344828,0.177979
1,ba12c0a2cb367af0467e479c03507c71a805d291,1.0,3.0,7.0,10.0,25.0,5.0,16.0,28.0,36.0,...,112,313,74,239,117,5137.111821,52.166667,0.0,2.794643,4.387548
2,b7298b0fe50443a623af9b56792b330c2d052845,0.0,24.0,28.0,39.0,22.0,2.0,2.0,4.0,7.0,...,110,246,36,210,110,5982.589431,49.2,0.018182,2.236364,1.547971
3,c60be70309789b39355dc612f36e37090ccad5dc,0.0,4.0,20.0,13.0,9.0,4.0,5.0,10.0,11.0,...,83,172,26,146,83,6237.773256,17.2,0.084337,2.072289,0.556678
4,a22a58c5be8aa2c2700619e37f2b7a6e4efa7e6b,0.0,6.0,0.0,0.0,1.0,1.0,19.0,38.0,55.0,...,103,277,38,239,103,5370.649819,25.181818,0.048544,2.68932,0.961238


In [50]:
X_test= different_rows.drop(columns=["username"])

In [51]:
X_pca = np.asarray(X_test)

In [52]:
y_pre1 = lgb_model.predict(X_pca)
y_pre2 = cat_model.predict(X_pca)
y_pre3 = voting_model.predict(X_pca)

In [53]:
username = different_rows['username'].to_list()
results1 = []
for i, value in enumerate(y_pre1):
    results1.append((username[i], value))
df = pd.DataFrame(results1, columns=['file_name', 'label'])
df.to_csv('outputqt_1.csv', index=False, header=False)

In [54]:
results2 = []
for i, value in enumerate(y_pre2):
    results2.append((username[i], value))
df = pd.DataFrame(results2, columns=['file_name', 'label'])
df.to_csv('outputqt_2.csv', index=False, header=False)

In [55]:
results3 = []
for i, value in enumerate(y_pre3):
    results3.append((username[i], value))
df = pd.DataFrame(results3, columns=['file_name', 'label'])
df.to_csv('outputqt_3.csv', index=False, header=False)