### 서울시 모기 발생상황 지표 예측
- 2016년부터 2019년까지의 일별 모기지수를 기온, 강수량 데이터로 예측해 본다.


데이터 출처
- https://data.kma.go.kr/stcs/grnd/grndTaList.do?pgmNo=70 (기상청)
- https://data.seoul.go.kr/dataList/16/literacyView.do (서울공공데이터포털)

In [4]:
# Load the training features and targets, the test features, and the frame
# that is later filled with predictions and saved as the submission.
import pandas as pd

# The train/test CSVs are read as EUC-KR; sub.csv uses the default encoding.
train_x = pd.read_csv('mosquitodata/train_x.csv', encoding='euc-kr')
train_y = pd.read_csv('mosquitodata/train_y.csv', encoding='euc-kr')
test_x = pd.read_csv('mosquitodata/test_x.csv', encoding='euc-kr')
sub = pd.read_csv('mosquitodata/sub.csv')

In [5]:
# Stack the train and test features so both receive identical preprocessing.
# NOTE: the original row labels are kept, so labels repeat across the two parts.
X = pd.concat([train_x,test_x])
X

Unnamed: 0,date,강수량(mm),평균기온(℃),최저기온(℃),최고기온(℃)
0,2019-12-31,0.0,-7.9,-10.9,-4.5
1,2019-12-30,0.4,2.7,-5.7,6.8
2,2019-12-29,1.4,3.8,1.1,6.2
3,2019-12-27,0.0,-1.7,-4.6,2.6
4,2019-12-25,0.0,2.0,-2.7,6.6
...,...,...,...,...,...
295,2019-12-14,0.0,2.7,-0.1,6.4
296,2019-12-19,0.0,-0.8,-4.5,3.1
297,2019-12-23,2.9,3.4,0.3,5.2
298,2019-12-26,0.3,2.0,-2.4,4.1


In [65]:
# Preprocessing helper: parse the date column and derive calendar features
# (year, month, day of month, day of week).
def preprocessing(df):
    """Return a copy of *df* with ``date`` parsed and calendar columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``date`` holds datetimes and the columns
        ``year``, ``month``, ``day`` and ``weekday`` (Monday=0 ... Sunday=6)
        are appended in that order. The caller's frame is left unchanged.

    Raises
    ------
    KeyError
        If *df* has no ``date`` column.
    """
    dates = pd.to_datetime(df["date"])
    # assign() builds a new frame, so the argument is never mutated and no
    # chained-assignment warning is raised when *df* is a slice of another frame.
    return df.assign(
        date=dates,
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
        weekday=dates.dt.weekday,
    )

In [66]:
# Parse dates and add the year/month/day/weekday columns to the stacked frame.
X = preprocessing(X)

In [67]:
# Drop the raw date column now that its parts exist as separate columns.
new_X = X.drop('date',axis=1)
new_X.head(3)

Unnamed: 0,강수량(mm),평균기온(℃),최저기온(℃),최고기온(℃),year,month,day,weekday
0,0.0,-7.9,-10.9,-4.5,2019,12,31,1
1,0.4,2.7,-5.7,6.8,2019,12,30,0
2,1.4,3.8,1.1,6.2,2019,12,29,6


In [68]:
# Join the training features with their target values (aligned on row label).
# Train was stacked before test, so the training rows are the first
# len(train_x) rows; derive the boundary from the data instead of a literal.
new_XX = pd.concat(
    [new_X.iloc[:len(train_x)], train_y['mosquito_ratio']],
    axis=1,
    join="inner",
)
new_XX

Unnamed: 0,강수량(mm),평균기온(℃),최저기온(℃),최고기온(℃),year,month,day,weekday,mosquito_ratio
0,0.0,-7.9,-10.9,-4.5,2019,12,31,1,5.5
1,0.4,2.7,-5.7,6.8,2019,12,30,0,5.5
2,1.4,3.8,1.1,6.2,2019,12,29,6,5.5
3,0.0,-1.7,-4.6,2.6,2019,12,27,4,5.5
4,0.0,2.0,-2.7,6.6,2019,12,25,2,5.5
...,...,...,...,...,...,...,...,...,...
1011,9.5,16.3,13.0,18.6,2016,5,6,4,248.5
1012,7.5,18.9,10.2,26.9,2016,5,5,3,243.8
1013,0.0,15.7,10.2,20.6,2016,5,4,2,256.2
1014,27.0,12.9,8.9,17.6,2016,5,3,1,304.0


In [69]:
# Mean mosquito index for each calendar month.
new_XX.groupby('month')['mosquito_ratio'].mean()

month
1       4.097059
2       4.123188
3       6.185526
4      24.975758
5     262.114286
6     617.132222
7     778.635052
8     291.417021
9     536.848864
10    224.647500
11      7.953086
12      4.846078
Name: mosquito_ratio, dtype: float64

In [70]:
# Label for each calendar month, assigned according to the monthly mosquito
# counts. The values are strings, so the resulting column is one-hot encoded
# by pd.get_dummies rather than treated as a number.
weather = {
    1: '0', 2: '0', 3: '0',
    4: '1', 5: '1', 6: '2',
    7: '2', 8: '1', 9: '2',
    10: '1', 11: '0', 12: '0',
}

In [71]:
# Look up each row's month in the weather dict and store the label in 'weather'.
new_X['weather'] = new_X["month"].map(weather)

In [73]:
# One-hot encode the string-valued 'weather' column; drop_first omits the first level.
new_X = pd.get_dummies(new_X,drop_first=True)
new_X

Unnamed: 0,강수량(mm),평균기온(℃),최저기온(℃),최고기온(℃),year,month,day,weekday,weather_1,weather_2
0,0.0,-7.9,-10.9,-4.5,2019,12,31,1,0,0
1,0.4,2.7,-5.7,6.8,2019,12,30,0,0,0
2,1.4,3.8,1.1,6.2,2019,12,29,6,0,0
3,0.0,-1.7,-4.6,2.6,2019,12,27,4,0,0
4,0.0,2.0,-2.7,6.6,2019,12,25,2,0,0
...,...,...,...,...,...,...,...,...,...,...
295,0.0,2.7,-0.1,6.4,2019,12,14,5,0,0
296,0.0,-0.8,-4.5,3.1,2019,12,19,3,0,0
297,2.9,3.4,0.3,5.2,2019,12,23,0,0,0
298,0.3,2.0,-2.4,4.1,2019,12,26,3,0,0


In [74]:
# Hold out 30% of the labelled rows for validation (fixed seed).
from sklearn.model_selection import train_test_split

# The labelled rows are the first len(train_x) rows of new_X (train was
# stacked before test), so the boundary follows the data, not a literal count.
# NOTE(review): new_X still carries the CSV's 'Unnamed: 0' column as a
# feature -- confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    new_X.iloc[:len(train_x)],
    train_y["mosquito_ratio"],
    test_size=0.3,
    random_state=42,
)

In [75]:
# Min-max scaling: fit on the training split only, then apply the same
# transform to the validation split and to the competition test rows.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)
# Every row after the first len(train_x) rows of new_X came from test_x.
ttest_scaled = scaler.transform(new_X.iloc[len(train_x):])

In [76]:
# Fit a random forest with default hyper-parameters (fixed seed) and predict
# the validation split.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=42)
rf.fit(train_scaled,y_train)
y_pred = rf.predict(test_scaled)

In [77]:
# Validate the random forest with the R^2 score on the held-out split.
from sklearn.metrics import r2_score

score = r2_score(y_test,y_pred)
score

0.9203678649210191

In [78]:
# Predict the scaled competition test rows with the random forest.
rf_pred = rf.predict(ttest_scaled)

In [79]:
# Store the random-forest predictions in the submission frame and save it.
sub['mosquito_ratio'] = rf_pred
sub.to_csv('submission.csv',index=False)

In [80]:
# Compare the saved submission with the published ground truth using R^2.
def FinalMseScore(
    truth_path="https://raw.githubusercontent.com/Datamanim/mosquito/main/result.csv",
    submission_path='./submission.csv',
):
    """Print and return the R^2 score of a saved submission.

    Despite its historical name, this function computes the coefficient of
    determination (``sklearn.metrics.r2_score``), not the mean squared error.

    Parameters
    ----------
    truth_path : str, optional
        CSV (path or URL) containing a ``mosquito_ratio`` column with the
        true values. Defaults to the originally hard-coded location.
    submission_path : str, optional
        CSV whose last column holds the predictions.

    Returns
    -------
    float
        R^2 of the predictions against the true values.
    """
    import pandas as pd
    from sklearn.metrics import r2_score

    y_true = pd.read_csv(truth_path)
    # Use a local name that does not shadow the notebook-level `sub` frame.
    submission = pd.read_csv(submission_path)
    pred = submission.iloc[:, -1].values
    r2 = r2_score(y_true['mosquito_ratio'], pred)
    # The label names the metric that is actually computed.
    print('submission r2 score : ', r2)
    return r2

# NOTE: despite the variable name, the returned value is an R^2 score
# (FinalMseScore calls r2_score).
final_mse = FinalMseScore()

submission mse score :  0.905619519028415


In [22]:
# Fit an XGBoost model with default hyper-parameters (fixed seed) and predict
# the validation split.
# NOTE(review): XGBRFRegressor is XGBoost's random-forest estimator, not the
# gradient-boosted XGBRegressor -- confirm this is the intended model.
import xgboost
xgb_reg = xgboost.XGBRFRegressor(random_state=42)
xgb_reg.fit(train_scaled,y_train)
y_pred = xgb_reg.predict(test_scaled)

In [23]:
# Validation R^2 of the XGBoost model (r2_score was imported in an earlier cell).
score = r2_score(y_test,y_pred)
score

0.9116651709041649

In [24]:
# Predict the scaled competition test rows with the XGBoost model.
xgb_pred = xgb_reg.predict(ttest_scaled)

In [25]:
# Store the XGBoost predictions and save them; this writes to the same
# submission.csv path as the random-forest cell, replacing that file.
sub['mosquito_ratio'] = xgb_pred
sub.to_csv('submission.csv',index=False)

In [26]:
# Final test comparison for the XGBoost submission, using R^2.
def FinalMseScore(
    truth_path="mosquitodata/result.csv",
    submission_path='./submission.csv',
):
    """Print and return the R^2 score of a saved submission.

    Despite its historical name, this function computes the coefficient of
    determination (``sklearn.metrics.r2_score``), not the mean squared error.

    Parameters
    ----------
    truth_path : str, optional
        CSV containing a ``mosquito_ratio`` column with the true values.
        Defaults to the originally hard-coded local file.
    submission_path : str, optional
        CSV whose last column holds the predictions.

    Returns
    -------
    float
        R^2 of the predictions against the true values.
    """
    import pandas as pd
    from sklearn.metrics import r2_score

    y_true = pd.read_csv(truth_path)
    # Use a local name that does not shadow the notebook-level `sub` frame.
    submission = pd.read_csv(submission_path)
    pred = submission.iloc[:, -1].values
    r2 = r2_score(y_true['mosquito_ratio'], pred)
    # The label names the metric that is actually computed.
    print('submission r2 score : ', r2)
    return r2

# NOTE: despite the variable name, the returned value is an R^2 score
# (FinalMseScore calls r2_score).
final_mse = FinalMseScore()

submission mse score :  0.901446372725677
