In [46]:
import pandas as pd

# 데이터 불러오기
train_data = pd.read_csv('C:/Users/82106/Desktop/데이터분석 프로젝트/소득예측/open/open/train.csv')

# 데이터프레임의 각 열의 데이터 타입 확인
data_types = train_data.dtypes

# 데이터 타입이 'object'인 열은 범주형 변수로 간주됩니다.
categorical_variables = data_types[data_types == 'object'].index.tolist()

# 범주형 변수 목록 출력
print("범주형 변수:", categorical_variables)



범주형 변수: ['ID', 'Gender', 'Education_Status', 'Employment_Status', 'Industry_Status', 'Occupation_Status', 'Race', 'Hispanic_Origin', 'Martial_Status', 'Household_Status', 'Household_Summary', 'Citizenship', 'Birth_Country', 'Birth_Country (Father)', 'Birth_Country (Mother)', 'Tax_Status', 'Income_Status']


#### 범주형 변수 라벨 인코딩

In [47]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 라벨 인코더 생성 및 학습
label_encoders = {}
for column in categorical_variables:
    label_encoders[column] = LabelEncoder()
    train_data[column] = label_encoders[column].fit_transform(train_data[column])

# 변환된 데이터 출력
print(train_data.head())

   ID  Age  Gender  Education_Status  Employment_Status  \
0   0   63       1                15                  2   
1   1   37       1                 1                  2   
2   2   58       0                12                  2   
3   3   44       1                12                  2   
4   4   37       0                12                  2   

   Working_Week (Yearly)  Industry_Status  Occupation_Status  Race  \
0                      4               20                 11     4   
1                     52                6                 11     4   
2                     52               11                  0     1   
3                     52               19                 12     4   
4                     52               19                 10     4   

   Hispanic_Origin  ...  Citizenship  Birth_Country  Birth_Country (Father)  \
0                0  ...            2             39                      39   
1                0  ...            2             39               

#### 결측치 확인하기

In [48]:
# 각 열의 결측치 개수 확인
missing_values = train_data.isnull().sum()

# 결측치가 있는 열 출력
columns_with_missing_values = missing_values[missing_values > 0].index.tolist()
print("결측치가 있는 열:", columns_with_missing_values)

# 각 열의 결측치 비율 확인
missing_percentage = (missing_values / len(train_data)) * 100

# 결측치 비율이 0% 이상인 열 출력
columns_with_high_missing_percentage = missing_percentage[missing_percentage > 0].index.tolist()
print("결측치 비율이 0% 이상인 열:", columns_with_high_missing_percentage)

결측치가 있는 열: []
결측치 비율이 0% 이상인 열: []


##### ID 열은 drop 하기

In [49]:
# 'ID' 열 삭제
train_data.drop(columns=['ID'], inplace=True)

# 삭제된 열을 제외한 데이터 확인
print(train_data.head())

   Age  Gender  Education_Status  Employment_Status  Working_Week (Yearly)  \
0   63       1                15                  2                      4   
1   37       1                 1                  2                     52   
2   58       0                12                  2                     52   
3   44       1                12                  2                     52   
4   37       0                12                  2                     52   

   Industry_Status  Occupation_Status  Race  Hispanic_Origin  Martial_Status  \
0               20                 11     4                0               1   
1                6                 11     4                0               4   
2               11                  0     1                0               1   
3               19                 12     4                0               0   
4               19                 10     4                0               0   

   ...  Citizenship  Birth_Country  Birth_Country 

##### 타겟변수 분리 및 특성 선택

In [50]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# 예측에 사용할 특성과 타겟 변수 분리
X = train_data.drop(columns=['Income'])  # 특성 변수
y = train_data['Income']  # 타겟 변수

# 상관관계를 기반으로 특성 선택
selector = SelectKBest(score_func=f_classif, k=10)  # k개의 특성 선택
X_selected = selector.fit_transform(X, y)

# 선택된 특성 인덱스 확인
selected_features_index = selector.get_support(indices=True)

# 선택된 특성 변수명 확인
selected_features = X.columns[selected_features_index]

# 선택된 특성 출력
print(selected_features)


Index(['Gender', 'Education_Status', 'Working_Week (Yearly)',
       'Industry_Status', 'Occupation_Status', 'Martial_Status',
       'Household_Status', 'Household_Summary', 'Tax_Status', 'Income_Status'],
      dtype='object')


##### 모델선택 

##### 선형회귀

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [52]:
# 데이터를 학습용과 검증용으로 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 선형 회귀 모델 초기화
model = LinearRegression()

# 모델 학습
model.fit(X_train, y_train)

In [53]:
# 검증 데이터 예측
y_val_pred = model.predict(X_val)

# 검증 데이터 평가 (평균 제곱 오차 사용)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)

# 평균 제곱 오차 출력
print("검증 데이터 RMSE:", val_rmse)

검증 데이터 RMSE: 624.048707557916


##### test.csv 로 예측한뒤 sample_submission에 저장하기

##### test.csv 전처리

In [54]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 테스트 데이터 불러오기
test_data = pd.read_csv('C:/Users/82106/Desktop/데이터분석 프로젝트/소득예측/open/open/test.csv')

# ID 열 삭제
test_data.drop(columns=['ID'], inplace=True)


In [55]:
# 범주형 변수 선택
categorical_variables = test_data.select_dtypes(include=['object']).columns

# 라벨 인코더 생성 및 적용
label_encoders = {}
for column in categorical_variables:
    label_encoders[column] = LabelEncoder()
    test_data[column] = label_encoders[column].fit_transform(test_data[column])


In [69]:
test_data

Unnamed: 0,Age,Gender,Education_Status,Employment_Status,Working_Week (Yearly),Industry_Status,Occupation_Status,Race,Hispanic_Origin,Martial_Status,...,Citizenship,Birth_Country,Birth_Country (Father),Birth_Country (Mother),Tax_Status,Gains,Losses,Dividends,Income_Status,Age_WorkingWeek_Ratio
0,0.877778,1.0,0.5625,0.000000,0.000000,0.608696,1.000000,1.00,0.000000,0.833333,...,0.5,0.926829,0.951220,0.952381,1.0,0.0,0.0,0.0,0.5,inf
1,0.522222,1.0,0.4375,0.000000,0.000000,0.608696,1.000000,1.00,0.888889,0.833333,...,0.5,0.926829,0.926829,0.928571,0.8,0.0,0.0,0.0,0.5,inf
2,0.200000,0.0,0.5625,0.000000,1.000000,0.826087,0.785714,1.00,0.000000,0.833333,...,0.5,0.926829,0.926829,0.928571,1.0,0.0,0.0,0.0,0.5,0.200000
3,0.433333,0.0,0.0625,0.285714,0.576923,0.521739,0.785714,1.00,0.000000,0.166667,...,0.5,0.926829,0.926829,0.928571,0.4,0.0,0.0,0.0,1.0,0.751111
4,0.066667,1.0,0.1875,0.000000,0.000000,0.608696,1.000000,1.00,0.666667,0.833333,...,0.5,0.926829,0.926829,0.928571,0.8,0.0,0.0,0.0,1.0,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.344444,1.0,0.1250,0.000000,1.000000,0.652174,0.857143,1.00,0.000000,0.833333,...,0.5,0.926829,0.926829,0.928571,1.0,0.0,0.0,0.0,0.5,0.344444
9996,0.300000,1.0,0.2500,0.000000,1.000000,0.826087,0.285714,1.00,0.000000,0.166667,...,0.5,0.926829,0.926829,0.928571,0.4,0.0,0.0,0.0,0.5,0.300000
9997,0.200000,1.0,0.7500,0.000000,0.134615,0.826087,0.785714,0.25,0.000000,0.833333,...,0.5,0.926829,0.926829,0.928571,0.8,0.0,0.0,0.0,0.5,1.485714
9998,0.100000,1.0,0.1875,0.000000,0.000000,0.608696,1.000000,1.00,0.000000,0.833333,...,0.5,0.926829,0.926829,0.928571,0.8,0.0,0.0,0.0,0.5,inf


##### 테스트 데이터로 모델을 이용해 예측 수행

In [57]:
# 테스트 데이터에 대한 예측 수행
y_test_pred = model.predict(test_data)


In [58]:
y_test_pred

array([151.50674792,  16.53121674, 516.07641627, ..., 213.90004146,
       110.92986709, 711.92796539])

#### sample_submission에 예측데이터 저장

In [59]:
# 예측 결과를 sample_submission.csv 파일에 채우기
sample_submission = pd.read_csv('C:/Users/82106/Desktop/데이터분석 프로젝트/소득예측/open/open/sample_submission.csv')
sample_submission['Income'] = y_test_pred

In [60]:
sample_submission.to_csv('C:/Users/82106/Desktop/데이터분석 프로젝트/소득예측/open/open/sample_submission3.csv', index=False)

#### 피처 엔지니어링

##### trian data 수치형 변수 Min Max 스케일링

In [61]:
from sklearn.preprocessing import MinMaxScaler

# 수치형 변수 선택
numeric_variables = train_data.select_dtypes(include=['int', 'float']).columns

# MinMax 스케일러 초기화
scaler = MinMaxScaler()

# 수치형 변수에 대해 MinMax 스케일링 적용
train_data[numeric_variables] = scaler.fit_transform(train_data[numeric_variables])

# 스케일링된 테스트 데이터 출력
print(train_data.head())

        Age  Gender  Education_Status  Employment_Status  \
0  0.700000     1.0            0.9375           0.285714   
1  0.411111     1.0            0.0625           0.285714   
2  0.644444     0.0            0.7500           0.285714   
3  0.488889     1.0            0.7500           0.285714   
4  0.411111     0.0            0.7500           0.285714   

   Working_Week (Yearly)  Industry_Status  Occupation_Status  Race  \
0               0.076923         0.869565           0.785714  1.00   
1               1.000000         0.260870           0.785714  1.00   
2               1.000000         0.478261           0.000000  0.25   
3               1.000000         0.826087           0.857143  1.00   
4               1.000000         0.826087           0.714286  1.00   

   Hispanic_Origin  Martial_Status  ...  Citizenship  Birth_Country  \
0              0.0        0.166667  ...          0.5       0.928571   
1              0.0        0.666667  ...          0.5       0.928571   
2    

##### test data 스케일링

In [62]:
from sklearn.preprocessing import MinMaxScaler

# 수치형 변수 선택
numeric_variables = test_data.select_dtypes(include=['int', 'float']).columns

# MinMax 스케일러 초기화
scaler = MinMaxScaler()

# 수치형 변수에 대해 MinMax 스케일링 적용
test_data[numeric_variables] = scaler.fit_transform(test_data[numeric_variables])

# 스케일링된 테스트 데이터 출력
print(test_data.head())

        Age  Gender  Education_Status  Employment_Status  \
0  0.877778     1.0            0.5625           0.000000   
1  0.522222     1.0            0.4375           0.000000   
2  0.200000     0.0            0.5625           0.000000   
3  0.433333     0.0            0.0625           0.285714   
4  0.066667     1.0            0.1875           0.000000   

   Working_Week (Yearly)  Industry_Status  Occupation_Status  Race  \
0               0.000000         0.608696           1.000000   1.0   
1               0.000000         0.608696           1.000000   1.0   
2               1.000000         0.826087           0.785714   1.0   
3               0.576923         0.521739           0.785714   1.0   
4               0.000000         0.608696           1.000000   1.0   

   Hispanic_Origin  Martial_Status  ...  Household_Summary  Citizenship  \
0         0.000000        0.833333  ...           0.571429          0.5   
1         0.888889        0.833333  ...           0.000000          

#### train_data 피처 엔지니어링

In [63]:
# 예시 1: 'Age'와 'Working_Week (Yearly)'를 조합한 새로운 피처 생성
train_data['Age_WorkingWeek_Ratio'] = train_data['Age'] / train_data['Working_Week (Yearly)']


# 변경된 데이터의 처음 몇 행을 출력하여 확인
print(train_data.head())

        Age  Gender  Education_Status  Employment_Status  \
0  0.700000     1.0            0.9375           0.285714   
1  0.411111     1.0            0.0625           0.285714   
2  0.644444     0.0            0.7500           0.285714   
3  0.488889     1.0            0.7500           0.285714   
4  0.411111     0.0            0.7500           0.285714   

   Working_Week (Yearly)  Industry_Status  Occupation_Status  Race  \
0               0.076923         0.869565           0.785714  1.00   
1               1.000000         0.260870           0.785714  1.00   
2               1.000000         0.478261           0.000000  0.25   
3               1.000000         0.826087           0.857143  1.00   
4               1.000000         0.826087           0.714286  1.00   

   Hispanic_Origin  Martial_Status  ...  Birth_Country  \
0              0.0        0.166667  ...       0.928571   
1              0.0        0.666667  ...       0.928571   
2              0.0        0.166667  ...     

#### test data 피처 엔지니어링

In [64]:
# 예시 1: 'Age'와 'Working_Week (Yearly)'를 조합한 새로운 피처 생성
test_data['Age_WorkingWeek_Ratio'] = test_data['Age'] / test_data['Working_Week (Yearly)']


# 변경된 데이터의 처음 몇 행을 출력하여 확인
print(test_data.head())

        Age  Gender  Education_Status  Employment_Status  \
0  0.877778     1.0            0.5625           0.000000   
1  0.522222     1.0            0.4375           0.000000   
2  0.200000     0.0            0.5625           0.000000   
3  0.433333     0.0            0.0625           0.285714   
4  0.066667     1.0            0.1875           0.000000   

   Working_Week (Yearly)  Industry_Status  Occupation_Status  Race  \
0               0.000000         0.608696           1.000000   1.0   
1               0.000000         0.608696           1.000000   1.0   
2               1.000000         0.826087           0.785714   1.0   
3               0.576923         0.521739           0.785714   1.0   
4               0.000000         0.608696           1.000000   1.0   

   Hispanic_Origin  Martial_Status  ...  Citizenship  Birth_Country  \
0         0.000000        0.833333  ...          0.5       0.926829   
1         0.888889        0.833333  ...          0.5       0.926829   
2    

#### 결측치 처리

In [68]:
train_data.isnull().sum()

Age                         0
Gender                      0
Education_Status            0
Employment_Status           0
Working_Week (Yearly)       0
Industry_Status             0
Occupation_Status           0
Race                        0
Hispanic_Origin             0
Martial_Status              0
Household_Status            0
Household_Summary           0
Citizenship                 0
Birth_Country               0
Birth_Country (Father)      0
Birth_Country (Mother)      0
Tax_Status                  0
Gains                       0
Losses                      0
Dividends                   0
Income_Status               0
Income                      0
Age_WorkingWeek_Ratio     145
dtype: int64

In [71]:
# 결측치를 각 열의 평균값으로 대체
train_data.fillna(train_data.mean(), inplace=True)

In [72]:
# 결측치를 각 열의 평균값으로 대체
test_data.fillna(train_data.mean(), inplace=True)

##### Age_WorkingWeek_Ratio 의 무한대값 평균값으로 대체

In [86]:
# 'Age_WorkingWeek_Ratio' 열에서 무한대 값을 NaN으로 대체
train_data['Age_WorkingWeek_Ratio'] = train_data['Age_WorkingWeek_Ratio'].replace([np.inf, -np.inf], np.nan)

# 'Age_WorkingWeek_Ratio' 열의 결측치를 해당 열의 평균값으로 대체
mean_age_workingweek_ratio = train_data['Age_WorkingWeek_Ratio'].mean()
train_data['Age_WorkingWeek_Ratio'].fillna(mean_age_workingweek_ratio, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Age_WorkingWeek_Ratio'].fillna(mean_age_workingweek_ratio, inplace=True)


In [90]:
# 'Age_WorkingWeek_Ratio' 열에서 무한대 값을 NaN으로 대체
test_data['Age_WorkingWeek_Ratio'] = test_data['Age_WorkingWeek_Ratio'].replace([np.inf, -np.inf], np.nan)

# 'Age_WorkingWeek_Ratio' 열의 결측치를 해당 열의 평균값으로 대체
mean_age_workingweek_ratio = test_data['Age_WorkingWeek_Ratio'].mean()
test_data['Age_WorkingWeek_Ratio'].fillna(mean_age_workingweek_ratio, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Age_WorkingWeek_Ratio'].fillna(mean_age_workingweek_ratio, inplace=True)


##### 훈련데이터 모델 학습 , 평가

In [88]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# 예측에 사용할 특성과 타겟 변수 분리
X = train_data.drop(columns=['Income'])  # 특성 변수
y = train_data['Income']  # 타겟 변수

In [89]:
# 데이터를 학습용과 검증용으로 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 선형 회귀 모델 초기화
model = LinearRegression()

# 모델 학습
model.fit(X_train, y_train)

In [91]:
# 검증 데이터 예측
y_val_pred = model.predict(X_val)

# 검증 데이터 평가 (평균 제곱 오차 사용)
val_mse = mean_squared_error(y_val, y_val_pred)
val_rmse = np.sqrt(val_mse)

# 평균 제곱 오차 출력
print("검증 데이터 RMSE:", val_rmse)

검증 데이터 RMSE: 0.062395506015311596


##### 테스트 데이터로 예측 수행

In [92]:
# 테스트 데이터에 대한 예측 수행
y_test_pred = model.predict(test_data)

In [93]:
# 예측 결과를 sample_submission.csv 파일에 채우기
sample_submission = pd.read_csv('C:/Users/82106/Desktop/데이터분석 프로젝트/소득예측/open/open/sample_submission.csv')
sample_submission['Income'] = y_test_pred

In [94]:
sample_submission.to_csv('C:/Users/82106/Desktop/데이터분석 프로젝트/소득예측/open/open/sample_submission4.csv', index=False)