In [1]:
import pandas as pd

In [2]:
# Read the training split, the test split and the sample submission template
# (in that order) from the data-files directory.
train, test, submission = map(
    pd.read_csv,
    (
        "data-files/train.csv",
        "data-files/test.csv",
        "data-files/sample_submission.csv",
    ),
)

In [3]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,ID,설립연도,국가,분야,투자단계,직원 수,인수여부,상장여부,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업가치(백억원),성공확률
0,TRAIN_0000,2009,CT005,이커머스,Series A,4126.0,No,No,56.0,3365.0,4764.0,4.71,,0.3
1,TRAIN_0001,2023,CT006,핀테크,Seed,4167.0,Yes,No,80.0,4069.0,279.0,1.0,2500-3500,0.8
2,TRAIN_0002,2018,CT007,기술,Series A,3132.0,Yes,Yes,54.0,6453.0,12141.0,4.0,3500-4500,0.5
3,TRAIN_0003,2016,CT006,,Seed,3245.0,Yes,Yes,,665.0,10547.0,2.97,,0.7
4,TRAIN_0004,2020,CT002,에듀테크,Seed,1969.0,No,Yes,94.0,829.0,9810.0,1.0,1500-2500,0.1


In [4]:
# Count missing values per column in the raw training data.
train.isnull().sum()

ID                   0
설립연도                 0
국가                   0
분야                 857
투자단계                 0
직원 수               174
인수여부                 0
상장여부                 0
고객수(백만명)          1320
총 투자금(억원)            0
연매출(억원)              0
SNS 팔로워 수(백만명)       0
기업가치(백억원)         1220
성공확률                 0
dtype: int64

In [5]:
# Columns excluded from the training features: the row identifier and the target.
except_cols = ["ID", "성공확률"]

# Target first, then the feature matrices for both splits.
y_train = train["성공확률"]
X_train = train.drop(except_cols, axis=1)
X_test = test.drop("ID", axis=1)

In [6]:
# Year against which the founding year ('설립연도') is converted into an age.
REFERENCE_YEAR = 2025

# Replace the founding year with the company age ('기업나이').
# The age is always computed from the untouched `train` / `test` frames, and
# `drop(..., errors='ignore')` tolerates the column already being gone, so this
# cell can be re-run without raising KeyError (the previous in-place drop could
# only be executed once per kernel session).
X_train = (
    X_train
    .assign(**{'기업나이': REFERENCE_YEAR - train['설립연도']})
    .drop(columns=['설립연도'], errors='ignore')
)
X_test = (
    X_test
    .assign(**{'기업나이': REFERENCE_YEAR - test['설립연도']})
    .drop(columns=['설립연도'], errors='ignore')
)

In [7]:
# Names of the object-dtype (categorical) feature columns.
cat_cols = list(X_train.select_dtypes(include="object").columns)

In [8]:
# Display the object-dtype column names collected above.
cat_cols

['국가', '분야', '투자단계', '인수여부', '상장여부', '기업가치(백억원)']

In [9]:
# One-hot encode the categorical columns of both splits.
X_train = pd.get_dummies(X_train, columns=cat_cols)
X_test = pd.get_dummies(X_test, columns=cat_cols)

# The two splits are encoded independently, so a category that appears in only
# one of them would produce different column sets and break (or misalign)
# prediction. Force the test matrix onto the training layout: dummy columns
# missing from test are added as all-zero, columns unknown to the model are
# dropped, and the column order matches X_train.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [10]:
# Count missing values per column of the encoded training features.
X_train.isnull().sum()

직원 수                    174
고객수(백만명)               1320
총 투자금(억원)                 0
연매출(억원)                   0
SNS 팔로워 수(백만명)            0
기업나이                      0
국가_CT001                  0
국가_CT002                  0
국가_CT003                  0
국가_CT004                  0
국가_CT005                  0
국가_CT006                  0
국가_CT007                  0
국가_CT008                  0
국가_CT009                  0
국가_CT010                  0
분야_AI                     0
분야_게임                     0
분야_기술                     0
분야_물류                     0
분야_에너지                    0
분야_에듀테크                   0
분야_이커머스                   0
분야_푸드테크                   0
분야_핀테크                    0
분야_헬스케어                   0
투자단계_IPO                  0
투자단계_Seed                 0
투자단계_Series A             0
투자단계_Series B             0
투자단계_Series C             0
인수여부_No                   0
인수여부_Yes                  0
상장여부_No                   0
상장여부_Yes                  0
기업가치(백억원)_1500-2500 

In [12]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def fill_by_nearest_mean(df, target_col, reference_col, k=10):
    """Impute missing ``target_col`` values from rows with a similar ``reference_col``.

    For every row whose ``target_col`` is missing, the ``k`` rows that have a
    known ``target_col`` and the closest ``reference_col`` value are looked up,
    and the mean of their ``target_col`` values is used as the fill value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns. It is not modified.
    target_col : str
        Column whose missing values are filled.
    reference_col : str
        Numeric column used to measure how close two rows are.
    k : int, default 10
        Number of neighbours to average. It is capped at the number of usable
        donor rows so that small frames do not raise.

    Returns
    -------
    pandas.Series
        Copy of ``df[target_col]`` with the imputed values. Rows whose
        ``reference_col`` is also missing are left as NaN. If nothing is
        missing, or no row can act as a donor, the column is returned unchanged.
    """
    filled = df[target_col].copy()
    has_reference = df[reference_col].notna()
    target_missing = filled.isna()

    # Positional boolean masks, so a non-unique index cannot confuse the lookup.
    needs_fill = (target_missing & has_reference).to_numpy()
    # Donors need both a known target and a known reference value; a NaN
    # reference cannot be fed to the neighbour search.
    is_donor = (~target_missing & has_reference).to_numpy()
    if not needs_fill.any() or not is_donor.any():
        return filled

    reference = df[reference_col].to_numpy(dtype=float, na_value=np.nan)
    donor_reference = reference[is_donor].reshape(-1, 1)
    donor_target = filled.to_numpy()[is_donor].astype(float)

    # Neighbours are ranked by distance in the reference column only.
    nbrs = NearestNeighbors(
        n_neighbors=min(k, len(donor_target)),  # never ask for more neighbours than donors
        algorithm='auto',
    )
    nbrs.fit(donor_reference)

    # Query every row that needs a value in one batch instead of one call per row.
    _, neighbor_idx = nbrs.kneighbors(reference[needs_fill].reshape(-1, 1))

    # Fill with the mean target value of each row's neighbours.
    filled.loc[needs_fill] = donor_target[neighbor_idx].mean(axis=1)
    return filled

In [13]:
# Impute each incomplete numeric feature from rows with similar '연매출(억원)',
# handling the training split and then the test split for every column.
for col in ('직원 수', '고객수(백만명)'):
    X_train[col] = fill_by_nearest_mean(X_train, col, '연매출(억원)', k=10)
    X_test[col] = fill_by_nearest_mean(X_test, col, '연매출(억원)', k=10)




In [21]:
# Display the processed test feature matrix.
X_test

Unnamed: 0,직원 수,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업나이,국가_CT001,국가_CT002,국가_CT003,국가_CT004,...,투자단계_Series C,인수여부_No,인수여부_Yes,상장여부_No,상장여부_Yes,기업가치(백억원)_1500-2500,기업가치(백억원)_2500-3500,기업가치(백억원)_3500-4500,기업가치(백억원)_4500-6000,기업가치(백억원)_6000이상
0,3261.0,45.0,5021.0,6680.0,2.00,23,False,False,False,False,...,True,True,False,False,True,True,False,False,False,False
1,3707.0,70.0,1601.0,4654.0,4.20,5,True,False,False,False,...,True,False,True,True,False,False,False,False,False,False
2,236.0,89.0,4709.0,9289.0,1.00,11,False,False,False,False,...,False,False,True,False,True,False,False,False,False,True
3,637.0,17.0,2145.0,7005.0,5.00,22,True,False,False,False,...,False,False,True,False,True,True,False,False,False,False
4,4922.0,68.0,4995.0,7593.0,4.36,19,False,False,False,False,...,False,False,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1750,2870.0,49.0,539.0,1942.0,4.00,10,False,False,True,False,...,False,False,True,False,True,True,False,False,False,False
1751,278.0,35.0,2373.0,10847.0,3.00,19,True,False,False,False,...,False,False,True,False,True,False,False,False,True,False
1752,1478.0,96.0,4215.0,8297.0,3.00,23,True,False,False,False,...,False,True,False,False,True,True,False,False,False,False
1753,3570.0,59.0,3333.0,1399.0,5.00,3,False,False,False,False,...,False,True,False,True,False,False,False,False,True,False


In [22]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
# Fit a 100-tree random forest with a fixed seed on the training features,
# then predict the target for the test features.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [24]:
# Display the predictions produced for the test set.
y_pred

array([0.505, 0.482, 0.517, ..., 0.557, 0.53 , 0.478], shape=(1755,))

In [25]:
# Pair the ID column of the test data with the predicted '성공확률' values.
submission = test[['ID']].assign(**{'성공확률': y_pred})

In [26]:
# Display the submission frame.
submission

Unnamed: 0,ID,성공확률
0,TEST_0000,0.505
1,TEST_0001,0.482
2,TEST_0002,0.517
3,TEST_0003,0.566
4,TEST_0004,0.638
...,...,...
1750,TEST_1750,0.525
1751,TEST_1751,0.583
1752,TEST_1752,0.557
1753,TEST_1753,0.530


In [27]:
# Write the submission to CSV without the index column.
submission.to_csv("data-files/submission_RandomForest5.csv",index=False)