In [None]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor


# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Widen the pandas display limits: characters shown per cell (the original
# note says the default was 50), number of rows, and number of columns.
for option_name, option_value in (
    ("display.max_colwidth", 100),
    ("display.max_rows", 500),
    ("display.max_columns", 500),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the candidate-level workbook for each of the three election years.
Data2013, Data2016, Data2019 = (
    pd.read_excel(workbook)
    for workbook in ("Data2013.xlsx", "Data2016.xlsx", "Data2019.xlsx")
)

# 2013年　前処理

## 新たな変数を追加する

### 情勢報道（朝日・読売・日経）

In [None]:
# Attach the newspaper outlook columns (asahi, nikkei, yomiuri and
# "3news_avg" -- presumably their average; confirm against the workbook)
# to each candidate, matching on the candidate name column.
news_columns = ["candidate_J", "3news_avg", "asahi", "nikkei", "yomiuri"]
news2013 = pd.read_excel("news2013.xlsx")[news_columns]
Data2013 = Data2013.merge(news2013, how='left', on='candidate_J')

### scandal ダミー

In [None]:
# Scandal dummy. Per the original note: for incumbents, 1 if they were
# involved in a scandal during their term and 0 otherwise (checked on
# Wikipedia).
scandal2013 = pd.read_excel("scandal2013.xlsx")

# The merge produces suffixed duplicates of `district` and `incumbent`;
# discard the right-hand (_y) copies and restore the original names on the
# left-hand (_x) ones.
Data2013 = (
    Data2013
    .merge(scandal2013, how='left', on='candidate_J')
    .drop(columns=["district_y", "incumbent_y"])
    .rename(columns={"district_x": "district", "incumbent_x": "incumbent"})
)

### 所属政党＊地方議会所属政党ダミー

In [None]:
# Per-district party seat shares in the local assemblies, as described in the
# original note. Source:
# http://www.soumu.go.jp/senkyo/senkyo_s/data/syozoku/ichiran.html
local_renames = {
    "district_code": "district_code", "district": "district",
    "ldp": "local_ldp", "ldp_rate": "local_ldp_rate",
    "dpj": "local_dpj", "dpj_rate": "local_dpj_rate",
    "kyosan": "local_jcp", "kyosan_rate": "local_jcp_rate",
    "komei": "local_kom", "komei_rate": "local_kom_rate",
    "shamin": "local_syamin", "shamin_rate": "local_syamin_rate",
    "ishin": "local_ishin", "ishin_rate": "local_ishin_rate",
}
local_keep = ["district", "local_ldp_rate", "local_dpj_rate", "local_jcp_rate",
              "local_kom_rate", "local_syamin_rate", "local_ishin_rate"]

# Rename to the local_* scheme, then keep only the district key and the rates.
local2013 = pd.read_excel("local2013.xlsx").rename(columns=local_renames)[local_keep]

# Bring the district-level rates onto every candidate row.
Data2013 = Data2013.merge(local2013, how='left', on='district')

# Interact each party column of the candidate with that party's local rate.
# Intent (from the original note): the stronger the candidate's own party is
# in the district, the larger the resulting value.
# Each triple is (candidate party column, district rate column, output column).
local_interactions = [
    ("ldp", "local_ldp_rate", "ldp_local_rate"),
    ("dpj", "local_dpj_rate", "dpj_local_rate"),
    ("jcp", "local_jcp_rate", "jcp_local_rate"),
    ("kom", "local_kom_rate", "kom_local_rate"),
    ("syamin", "local_syamin_rate", "syamin_local_rate"),
    ("ishin", "local_ishin_rate", "ishin_local_rate"),
]
local_advantage = None
for party_col, rate_col, out_col in local_interactions:
    Data2013[out_col] = Data2013[party_col] * Data2013[rate_col]
    # Accumulate with `+` (not DataFrame.sum) so a NaN in any term propagates.
    if local_advantage is None:
        local_advantage = Data2013[out_col]
    else:
        local_advantage = local_advantage + Data2013[out_col]
Data2013["party_local_advantage"] = local_advantage

### 所属政党＊衆院選所属政党ダミー

In [None]:
# Per-district party figures from the "HR_12_VS" workbook (judging by the
# section heading and the *_vs column names these are House of
# Representatives vote shares -- confirm against the file).
# `read_excel` has no `index` parameter, so the former `index=False` argument
# is dropped; current pandas raises TypeError on it.
HR2013 = pd.read_excel("HR_12_VS.xlsx")

# Rename to the HR_* scheme.
HR2013 = HR2013.rename(columns={
                             "ldp_vs": "HR_ldp_rate", "dpj_vs": "HR_dpj_rate",
                             "komei_vs": "HR_kom_rate", "jcp_vs": "HR_jcp_rate",
                             "sdp_vs": "HR_syamin_rate", "ishin_vs": "HR_ishin_rate"})

# Keep only the district key and the renamed rate columns.
HR2013 = HR2013[["district", "HR_ldp_rate", "HR_dpj_rate", "HR_kom_rate",
                 "HR_jcp_rate", "HR_syamin_rate", "HR_ishin_rate"]]

# Bring the district-level rates onto every candidate row.
Data2013 = pd.merge(Data2013, HR2013, on='district', how='left')

# Interact each party column of the candidate with that party's district rate.
# Intent (from the original note): the stronger the candidate's own party is
# in the district, the larger the resulting value.
Data2013["ldp_HR_rate"] = Data2013["ldp"]*Data2013["HR_ldp_rate"]
Data2013["dpj_HR_rate"] = Data2013["dpj"]*Data2013["HR_dpj_rate"]
Data2013["jcp_HR_rate"] = Data2013["jcp"]*Data2013["HR_jcp_rate"]
Data2013["kom_HR_rate"] = Data2013["kom"]*Data2013["HR_kom_rate"]
Data2013["syamin_HR_rate"] = Data2013["syamin"]*Data2013["HR_syamin_rate"]
Data2013["ishin_HR_rate"] = Data2013["ishin"]*Data2013["HR_ishin_rate"]

# Sum of the six interaction terms; `+` is used so a NaN in any term propagates.
Data2013["party_HR_advantage"] = (
    Data2013["ldp_HR_rate"] + Data2013["dpj_HR_rate"] + Data2013["jcp_HR_rate"]
    + Data2013["kom_HR_rate"] + Data2013["syamin_HR_rate"] + Data2013["ishin_HR_rate"]
)


### 所属政党＊世論調査所属政党ダミー

### 与党ダミー

In [None]:
# Ruling-party dummy: 1 when the candidate's "ldp" or "kom" column equals 1,
# 0 otherwise.
in_ruling_party = (Data2013["ldp"] == 1) | (Data2013["kom"] == 1)
Data2013["ruling"] = in_ruling_party.astype("int64")

### 与党＊地方議会与党比率ダミー

In [None]:
# Combined local rate of the two ruling parties (ldp + kom) in the district.
ruling_share = Data2013["local_ldp_rate"] + Data2013["local_kom_rate"]
Data2013["local_ruling_rate"] = ruling_share

# Interaction with the ruling dummy, scaled by 0.01 (presumably converting a
# percentage to a fraction -- confirm the unit of the rate columns).
Data2013["ruling_local_rate"] = Data2013["ruling"] * Data2013["local_ruling_rate"] * 0.01

### 与党＊沖縄ダミー

In [None]:
# Okinawa dummy: 1 for rows whose district is "沖縄", 0 otherwise.
is_okinawa = Data2013["district"] == "沖縄"
Data2013["okinawa"] = is_okinawa.astype("int64")

# Interaction: ruling-party candidate standing in Okinawa.
Data2013["ruling_okinawa"] = Data2013["ruling"] * Data2013["okinawa"]

### 与党＊天気ダミー

In [None]:
# The weather workbook is transposed before the "district" and 2013 columns
# are selected, so the sheet presumably stores those as row labels -- confirm
# against the file.
weather = pd.DataFrame(pd.read_excel("weather.xlsx"))
weather2013 = weather.T[["district", 2013]].rename(columns={2013: "weather"})

# Bring the district-level weather value onto every candidate row.
Data2013 = Data2013.merge(weather2013, how='left', on='district')

# Interaction of the ruling dummy with the weather value.
Data2013["ruling_weather"] = Data2013["ruling"] * Data2013["weather"]

### 世襲ダミー

In [None]:
# Hereditary dummy: blanks in the workbook are filled with 0 before the
# candidate key and the flag are selected.
# NOTE(review): the fill happens before the merge, so candidates absent from
# the workbook still end up with NaN after the left join.
hereditary2013 = pd.read_excel("hereditary2013.xlsx").fillna(0)[["candidate_J", "hereditary"]]
Data2013 = Data2013.merge(hereditary2013, how='left', on='candidate_J')

In [None]:
# Persist the 2013 frame with all added features; the modelling section
# reloads this pickle.
Data2013.to_pickle("Data2013.pickle")

# 2016年　前処理

## 新たな変数を追加する

### 情勢報道（朝日・読売・日経）

In [None]:
# Attach the newspaper outlook columns (asahi, nikkei, yomiuri and
# "3news_avg" -- presumably their average; confirm against the workbook)
# to each candidate, matching on the candidate name column.
news_columns = ["candidate_J", "3news_avg", "asahi", "nikkei", "yomiuri"]
news2016 = pd.read_excel("news2016.xlsx")[news_columns]
Data2016 = Data2016.merge(news2016, how='left', on='candidate_J')

### scandal ダミー

In [None]:
# Scandal dummy. Per the original note: for incumbents, 1 if they were
# involved in a scandal during their term and 0 otherwise (checked on
# Wikipedia).
scandal2016 = pd.read_excel("scandal2016.xlsx")

# The merge produces suffixed duplicates of `district` and `incumbent`;
# discard the right-hand (_y) copies and restore the original names on the
# left-hand (_x) ones.
Data2016 = (
    Data2016
    .merge(scandal2016, how='left', on='candidate_J')
    .drop(columns=["district_y", "incumbent_y"])
    .rename(columns={"district_x": "district", "incumbent_x": "incumbent"})
)

### 所属政党＊地方議会所属政党ダミー

In [None]:
# Per-district party seat shares in the local assemblies, as described in the
# original note. Source:
# http://www.soumu.go.jp/senkyo/senkyo_s/data/syozoku/ichiran.html
local_renames = {
    "district_code": "district_code", "district": "district",
    "ldp": "local_ldp", "ldp_rate": "local_ldp_rate",
    "dpj": "local_dpj", "dpj_rate": "local_dpj_rate",
    "kyosan": "local_jcp", "kyosan_rate": "local_jcp_rate",
    "komei": "local_kom", "komei_rate": "local_kom_rate",
    "shamin": "local_syamin", "shamin_rate": "local_syamin_rate",
    "ishin": "local_ishin", "ishin_rate": "local_ishin_rate",
}
local_keep = ["district", "local_ldp_rate", "local_dpj_rate", "local_jcp_rate",
              "local_kom_rate", "local_syamin_rate", "local_ishin_rate"]

# Rename to the local_* scheme, then keep only the district key and the rates.
local2016 = pd.read_excel("local2016.xlsx").rename(columns=local_renames)[local_keep]

# Bring the district-level rates onto every candidate row.
Data2016 = Data2016.merge(local2016, how='left', on='district')

# Interact each party column of the candidate with that party's local rate.
# Intent (from the original note): the stronger the candidate's own party is
# in the district, the larger the resulting value.
# Each triple is (candidate party column, district rate column, output column).
local_interactions = [
    ("ldp", "local_ldp_rate", "ldp_local_rate"),
    ("dpj", "local_dpj_rate", "dpj_local_rate"),
    ("jcp", "local_jcp_rate", "jcp_local_rate"),
    ("kom", "local_kom_rate", "kom_local_rate"),
    ("syamin", "local_syamin_rate", "syamin_local_rate"),
    ("ishin", "local_ishin_rate", "ishin_local_rate"),
]
local_advantage = None
for party_col, rate_col, out_col in local_interactions:
    Data2016[out_col] = Data2016[party_col] * Data2016[rate_col]
    # Accumulate with `+` (not DataFrame.sum) so a NaN in any term propagates.
    if local_advantage is None:
        local_advantage = Data2016[out_col]
    else:
        local_advantage = local_advantage + Data2016[out_col]
Data2016["party_local_advantage"] = local_advantage

### 所属政党＊衆院選所属政党ダミー

In [None]:
# Per-district party figures from the "HR_14_VS" workbook (judging by the
# section heading and the *_vs column names these are House of
# Representatives vote shares -- confirm against the file).
# `read_excel` has no `index` parameter, so the former `index=False` argument
# is dropped; current pandas raises TypeError on it.
HR2016 = pd.read_excel("HR_14_VS.xlsx")

# Rename to the HR_* scheme.
HR2016 = HR2016.rename(columns={
                             "ldp_vs": "HR_ldp_rate", "dpj_vs": "HR_dpj_rate",
                             "komei_vs": "HR_kom_rate", "jcp_vs": "HR_jcp_rate",
                             "sdp_vs": "HR_syamin_rate", "ishin_vs": "HR_ishin_rate"})

# Keep only the district key and the renamed rate columns.
HR2016 = HR2016[["district", "HR_ldp_rate", "HR_dpj_rate", "HR_kom_rate",
                 "HR_jcp_rate", "HR_syamin_rate", "HR_ishin_rate"]]

# Bring the district-level rates onto every candidate row.
Data2016 = pd.merge(Data2016, HR2016, on='district', how='left')

# Interact each party column of the candidate with that party's district rate.
# Intent (from the original note): the stronger the candidate's own party is
# in the district, the larger the resulting value.
Data2016["ldp_HR_rate"] = Data2016["ldp"]*Data2016["HR_ldp_rate"]
Data2016["dpj_HR_rate"] = Data2016["dpj"]*Data2016["HR_dpj_rate"]
Data2016["jcp_HR_rate"] = Data2016["jcp"]*Data2016["HR_jcp_rate"]
Data2016["kom_HR_rate"] = Data2016["kom"]*Data2016["HR_kom_rate"]
Data2016["syamin_HR_rate"] = Data2016["syamin"]*Data2016["HR_syamin_rate"]
Data2016["ishin_HR_rate"] = Data2016["ishin"]*Data2016["HR_ishin_rate"]

# Sum of the six interaction terms; `+` is used so a NaN in any term propagates.
Data2016["party_HR_advantage"] = (
    Data2016["ldp_HR_rate"] + Data2016["dpj_HR_rate"] + Data2016["jcp_HR_rate"]
    + Data2016["kom_HR_rate"] + Data2016["syamin_HR_rate"] + Data2016["ishin_HR_rate"]
)


### 所属政党＊世論調査所属政党ダミー

### 与党ダミー

In [None]:
# Ruling-party dummy: 1 when the candidate's "ldp" or "kom" column equals 1,
# 0 otherwise.
in_ruling_party = (Data2016["ldp"] == 1) | (Data2016["kom"] == 1)
Data2016["ruling"] = in_ruling_party.astype("int64")

### 与党＊地方議会与党比率ダミー

In [None]:
# Combined local rate of the two ruling parties (ldp + kom) in the district.
ruling_share = Data2016["local_ldp_rate"] + Data2016["local_kom_rate"]
Data2016["local_ruling_rate"] = ruling_share

# Interaction with the ruling dummy, scaled by 0.01 (presumably converting a
# percentage to a fraction -- confirm the unit of the rate columns).
Data2016["ruling_local_rate"] = Data2016["ruling"] * Data2016["local_ruling_rate"] * 0.01

### 与党＊衆議院与党比率ダミー

In [None]:
# Reload the "HR_14_VS" workbook to build the combined ldp + komei figure per
# district. `read_excel` has no `index` parameter, so the former
# `index=False` argument is dropped; current pandas raises TypeError on it.
# Note this rebinds HR2016, replacing the frame built in the earlier cell.
HR2016 = pd.read_excel("HR_14_VS.xlsx")
HR2016 = HR2016.rename({"komei_vs": "kom_vs"}, axis=1)
HR2016["ruling_vs"] = HR2016["ldp_vs"] + HR2016["kom_vs"]
HR2016 = HR2016[["district", "ruling_vs"]]

# Bring the district-level figure onto every candidate row.
Data2016 = pd.merge(Data2016, HR2016, on='district', how='left')

# Interaction with the ruling dummy, scaled by 0.01 (presumably converting a
# percentage to a fraction -- confirm the unit of the *_vs columns).
Data2016["ruling_HR_vs"] = Data2016["ruling"]*Data2016["ruling_vs"]*0.01

### 与党＊沖縄ダミー

In [None]:
# Okinawa dummy: 1 for rows whose district is "沖縄", 0 otherwise.
is_okinawa = Data2016["district"] == "沖縄"
Data2016["okinawa"] = is_okinawa.astype("int64")

# Interaction: ruling-party candidate standing in Okinawa.
Data2016["ruling_okinawa"] = Data2016["ruling"] * Data2016["okinawa"]

### 世襲ダミー

In [None]:
# Hereditary dummy: blanks in the workbook are filled with 0 before the
# candidate key and the flag are selected.
# NOTE(review): the fill happens before the merge, so candidates absent from
# the workbook still end up with NaN after the left join.
hereditary2016 = pd.read_excel("hereditary2016.xlsx").fillna(0)[["candidate_J", "hereditary"]]
Data2016 = Data2016.merge(hereditary2016, how='left', on='candidate_J')

### 与党＊天気ダミー

In [None]:
# The weather workbook is transposed before the "district" and 2016 columns
# are selected, so the sheet presumably stores those as row labels -- confirm
# against the file.
weather = pd.DataFrame(pd.read_excel("weather.xlsx"))
weather2016 = weather.T[["district", 2016]].rename(columns={2016: "weather"})

# Bring the district-level weather value onto every candidate row.
Data2016 = Data2016.merge(weather2016, how='left', on='district')

# Interaction of the ruling dummy with the weather value.
Data2016["ruling_weather"] = Data2016["ruling"] * Data2016["weather"]

In [None]:
# Display the full 2016 frame to eyeball the added feature columns.
Data2016

In [None]:
# Persist the 2016 frame with all added features; the modelling section
# reloads this pickle.
Data2016.to_pickle("Data2016.pickle")

# 2019年　前処理

## 新たな変数を追加する

### 情勢報道（朝日・読売・日経）

In [None]:
# Attach the newspaper outlook columns (asahi, nikkei, yomiuri and
# "3news_avg" -- presumably their average; confirm against the workbook)
# to each candidate, matching on the candidate name column.
news_columns = ["candidate_J", "3news_avg", "asahi", "nikkei", "yomiuri"]
news2019 = pd.read_excel("news2019.xlsx")[news_columns]
Data2019 = Data2019.merge(news2019, how='left', on='candidate_J')

### femaleダミー

In [None]:
# Female dummy for 2019: every column of the workbook is joined onto the
# candidates, matching on the candidate name column.
female2019 = pd.read_excel("female2019.xlsx")
Data2019 = Data2019.merge(female2019, how='left', on='candidate_J')

In [None]:
# Display the full 2019 frame to check the merges so far.
Data2019

### スキャンダルダミー

In [None]:
# Scandal dummy. Per the original note: for incumbents, 1 if they were
# involved in a scandal during their term and 0 otherwise (checked on
# Wikipedia).
scandal2019 = pd.read_excel("scandal2019.xlsx")

# The merge produces suffixed duplicates of `district` and `incumbent`;
# discard the right-hand (_y) copies and restore the original names on the
# left-hand (_x) ones.
Data2019 = (
    Data2019
    .merge(scandal2019, how='left', on='candidate_J')
    .drop(columns=["district_y", "incumbent_y"])
    .rename(columns={"district_x": "district", "incumbent_x": "incumbent"})
)

### 所属政党＊地方議会所属政党ダミー

In [None]:
# Per-district party seat shares in the local assemblies, as described in the
# original note. Source:
# http://www.soumu.go.jp/senkyo/senkyo_s/data/syozoku/ichiran.html
# For 2019 the "rikken" and "nihon_ishin" source columns are mapped onto the
# dpj / ishin names used for the earlier years.
local_renames = {
    "district_code": "district_code", "district": "district",
    "ldp": "local_ldp", "ldp_rate": "local_ldp_rate",
    "rikken": "local_dpj", "rikken_rate": "local_dpj_rate",
    "kyosan": "local_jcp", "kyosan_rate": "local_jcp_rate",
    "komei": "local_kom", "komei_rate": "local_kom_rate",
    "shamin": "local_syamin", "shamin_rate": "local_syamin_rate",
    "nihon_ishin": "local_ishin", "nihon_ishin_rate": "local_ishin_rate",
}
local_keep = ["district", "local_ldp_rate", "local_dpj_rate", "local_jcp_rate",
              "local_kom_rate", "local_syamin_rate", "local_ishin_rate"]

# Rename to the local_* scheme, then keep only the district key and the rates.
local2019 = pd.read_excel("local2019.xlsx").rename(columns=local_renames)[local_keep]

# Bring the district-level rates onto every candidate row.
Data2019 = Data2019.merge(local2019, how='left', on='district')

# Interact each party column of the candidate with that party's local rate.
# Intent (from the original note): the stronger the candidate's own party is
# in the district, the larger the resulting value.
# Each triple is (candidate party column, district rate column, output column).
local_interactions = [
    ("ldp", "local_ldp_rate", "ldp_local_rate"),
    ("dpj", "local_dpj_rate", "dpj_local_rate"),
    ("jcp", "local_jcp_rate", "jcp_local_rate"),
    ("kom", "local_kom_rate", "kom_local_rate"),
    ("syamin", "local_syamin_rate", "syamin_local_rate"),
    ("ishin", "local_ishin_rate", "ishin_local_rate"),
]
local_advantage = None
for party_col, rate_col, out_col in local_interactions:
    Data2019[out_col] = Data2019[party_col] * Data2019[rate_col]
    # Accumulate with `+` (not DataFrame.sum) so a NaN in any term propagates.
    if local_advantage is None:
        local_advantage = Data2019[out_col]
    else:
        local_advantage = local_advantage + Data2019[out_col]
Data2019["party_local_advantage"] = local_advantage

### 所属政党＊衆院選所属政党ダミー

In [None]:
# Per-district party figures from the "HR_17_VS" workbook (judging by the
# section heading and the *_vs column names these are House of
# Representatives vote shares -- confirm against the file).
# `read_excel` has no `index` parameter, so the former `index=False` argument
# is dropped; current pandas raises TypeError on it.
HR2019 = pd.read_excel("HR_17_VS.xlsx")

# Rename to the HR_* scheme; for 2019 the "cdp_vs" column fills the dpj slot.
HR2019 = HR2019.rename(columns={
                             "ldp_vs": "HR_ldp_rate", "cdp_vs": "HR_dpj_rate",
                             "komei_vs": "HR_kom_rate", "jcp_vs": "HR_jcp_rate",
                             "sdp_vs": "HR_syamin_rate", "ishin_vs": "HR_ishin_rate"})

# Keep only the district key and the renamed rate columns.
HR2019 = HR2019[["district", "HR_ldp_rate", "HR_dpj_rate", "HR_kom_rate",
                 "HR_jcp_rate", "HR_syamin_rate", "HR_ishin_rate"]]

# Bring the district-level rates onto every candidate row.
Data2019 = pd.merge(Data2019, HR2019, on='district', how='left')

# Interact each party column of the candidate with that party's district rate.
# Intent (from the original note): the stronger the candidate's own party is
# in the district, the larger the resulting value.
Data2019["ldp_HR_rate"] = Data2019["ldp"]*Data2019["HR_ldp_rate"]
Data2019["dpj_HR_rate"] = Data2019["dpj"]*Data2019["HR_dpj_rate"]
Data2019["jcp_HR_rate"] = Data2019["jcp"]*Data2019["HR_jcp_rate"]
Data2019["kom_HR_rate"] = Data2019["kom"]*Data2019["HR_kom_rate"]
Data2019["syamin_HR_rate"] = Data2019["syamin"]*Data2019["HR_syamin_rate"]
Data2019["ishin_HR_rate"] = Data2019["ishin"]*Data2019["HR_ishin_rate"]

# Sum of the six interaction terms; `+` is used so a NaN in any term propagates.
Data2019["party_HR_advantage"] = (
    Data2019["ldp_HR_rate"] + Data2019["dpj_HR_rate"] + Data2019["jcp_HR_rate"]
    + Data2019["kom_HR_rate"] + Data2019["syamin_HR_rate"] + Data2019["ishin_HR_rate"]
)


### 所属政党＊世論調査所属政党ダミー

### 与党ダミー

In [None]:
# Ruling-party dummy: 1 when the candidate's "ldp" or "kom" column equals 1,
# 0 otherwise.
in_ruling_party = (Data2019["ldp"] == 1) | (Data2019["kom"] == 1)
Data2019["ruling"] = in_ruling_party.astype("int64")

### 与党＊地方議会与党比率ダミー

In [None]:
# Combined local rate of the two ruling parties (ldp + kom) in the district.
ruling_share = Data2019["local_ldp_rate"] + Data2019["local_kom_rate"]
Data2019["local_ruling_rate"] = ruling_share

# Interaction with the ruling dummy, scaled by 0.01 (presumably converting a
# percentage to a fraction -- confirm the unit of the rate columns).
Data2019["ruling_local_rate"] = Data2019["ruling"] * Data2019["local_ruling_rate"] * 0.01

### 与党＊衆議院与党比率ダミー

In [None]:
# Reload the "HR_17_VS" workbook to build the combined ldp + komei figure per
# district. `read_excel` has no `index` parameter, so the former
# `index=False` argument is dropped; current pandas raises TypeError on it.
# Note this rebinds HR2019, replacing the frame built in the earlier cell.
HR2019 = pd.read_excel("HR_17_VS.xlsx")
HR2019 = HR2019.rename({"komei_vs": "kom_vs"}, axis=1)
HR2019["ruling_vs"] = HR2019["ldp_vs"] + HR2019["kom_vs"]
HR2019 = HR2019[["district", "ruling_vs"]]

# Bring the district-level figure onto every candidate row.
Data2019 = pd.merge(Data2019, HR2019, on='district', how='left')

# Interaction with the ruling dummy, scaled by 0.01 (presumably converting a
# percentage to a fraction -- confirm the unit of the *_vs columns).
Data2019["ruling_HR_vs"] = Data2019["ruling"]*Data2019["ruling_vs"]*0.01

### 与党＊沖縄ダミー

In [None]:
# Okinawa dummy: 1 for rows whose district is "沖縄", 0 otherwise.
is_okinawa = Data2019["district"] == "沖縄"
Data2019["okinawa"] = is_okinawa.astype("int64")

# Interaction: ruling-party candidate standing in Okinawa.
Data2019["ruling_okinawa"] = Data2019["ruling"] * Data2019["okinawa"]

### 世襲ダミー

In [None]:
# Hereditary dummy: blanks in the workbook are filled with 0 before the
# candidate key and the flag are selected.
# NOTE(review): the fill happens before the merge, so candidates absent from
# the workbook still end up with NaN after the left join.
hereditary2019 = pd.read_excel("hereditary2019.xlsx").fillna(0)[["candidate_J", "hereditary"]]
Data2019 = Data2019.merge(hereditary2019, how='left', on='candidate_J')

### 天気ダミー

In [None]:
# Persist the 2019 frame with all added features; the prediction section
# reloads this pickle. (No weather feature is added for 2019 in this notebook.)
Data2019.to_pickle("Data2019.pickle")

# 2013・2016年　モデル生成

In [None]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor


# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Widen the pandas display limits: characters shown per cell (the original
# note says the default was 50), number of rows, and number of columns.
for option_name, option_value in (
    ("display.max_colwidth", 100),
    ("display.max_rows", 500),
    ("display.max_columns", 500),
):
    pd.set_option(option_name, option_value)

In [None]:
# Reload the preprocessed 2013 and 2016 frames and tag each with its
# election year.
Data2013 = pd.read_pickle("Data2013.pickle")
Data2013["year"] = 2013

Data2016 = pd.read_pickle("Data2016.pickle")
Data2016["year"] = 2016

# Stack both years into the training set. The original row labels are kept,
# so index values repeat across the two years.
Data_train = pd.concat([Data2013, Data2016])

In [None]:
# Target ("wl") followed by the explanatory variables fed to the classifier.
model_columns = [
    "wl", "elected_count", "incumbent", "3news_avg", "female", "hereditary",
    "party_local_advantage", "party_HR_advantage", "age", "newcomer",
]
df = Data_train[model_columns]

In [None]:
# Seed NumPy's global RNG for reproducibility (the split below is
# additionally pinned by its own random_state).
np.random.seed(0)

# Separate the target column "wl" from the explanatory variables.
X = df.drop(labels=["wl"], axis = 1)
y = df["wl"]

# Keep the feature and target names for later reporting.
feature_x = X.columns
feature_y = y.name

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=0.2, random_state=0)

print("shape of X and y for training: ", X_train.shape, y_train.shape)
# The label previously read "teswlng", a corrupted spelling of "testing".
print("shape of X and y for testing: ", X_test.shape, y_test.shape)

In [None]:
# y_train.sum()/y_train.count()

In [None]:
# y_test.sum()/y_test.count()

In [None]:
# #Grid_Searchする
# #必要なライブラリのインポート
# from sklearn.model_selection import GridSearchCV

# params = {
#        'n_estimators'      : [10,25,50,75,100, 500],
#        'random_state'      : [0],
#       # 'n_jobs'            : [-1],
#       # 'min_samples_split' : [2,5,10, 15, 20,25, 30],
#        'max_depth'         : [5, 10, 15,20,25,30],
#        'max_leaf_nodes'    : [3, 5, 10, 16, 20]
# }
# GBC_grid = GridSearchCV(estimator=GradientBoostingClassifier(random_state=0), param_grid=params, \
#                        scoring="f1", cv = 10)
# GBC_grid.fit(X_train, y_train.ravel())
# print(GBC_grid.best_estimator_)

# y_pred = GBC_grid.predict(X_test)
# print(accuracy_score(y_test, y_pred))

# # recallもためしてみます

In [None]:
#print(clf_grid.score(X_train, y_train))
#print(clf_grid.score(X_test, y_test))

In [None]:
# Train the gradient boosting classifier
# Gradient boosting classifier (not a random forest, despite older notes in
# this notebook). Only the settings that matter are spelled out; everything
# else is left at the library default, which is what the previous call
# requested explicitly. The arguments `presort`, `min_impurity_split` and
# `loss='deviance'` were removed because current scikit-learn no longer
# accepts them.
clf = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    max_depth=5,
    max_leaf_nodes=16,
    random_state=0,
)
clf.fit(X_train, y_train)


In [None]:
# Feature importances of the fitted model, labelled by feature name and
# listed from most to least important.
importances = pd.Series(clf.feature_importances_, index=feature_x)
importances.sort_values(ascending=False)

In [None]:
# Predictions on the held-out split: hard class labels and per-class
# probabilities.
y_pred = clf.predict(X_test)
mat_prob = clf.predict_proba(X_test)

In [None]:
# Show the predicted class probabilities for the held-out split.
mat_prob

In [None]:
# Accuracy and F1 score on the held-out split.
acc, f = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

for metric_label, metric_value in (("accuracy:", acc), ("f1:", f)):
    print(metric_label, metric_value)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Ideas noted for follow-up:
# - resample to soften the class imbalance
# - try a random forest
# - pass class_weight to soften the class imbalance
# - compare y_pred with y_test to pin down the misclassified cases and their cause

# Confusion matrix of the held-out predictions, drawn with integer counts.
test_confusion = confusion_matrix(y_test, y_pred)
sns.heatmap(test_confusion, annot=True, fmt='d')

In [None]:
# 10-fold cross-validation of the classifier on the training split.
from sklearn.model_selection import cross_val_score
import numpy as np

scores = cross_val_score(clf, X_train, y_train, cv=10)

# Per-fold scores, then their mean.
print('Cross-Validation scores: {}'.format(scores))
average_score = np.mean(scores)
print('Average score: {}'.format(average_score))

In [None]:
# 10-fold cross-validation of the classifier on the held-out split.
# NOTE(review): cross_val_score refits clones of the estimator on folds of
# the data it is given, so this trains on parts of the test split instead of
# scoring the model fitted on X_train.
from sklearn.model_selection import cross_val_score
import numpy as np

scores = cross_val_score(clf, X_test, y_test, cv=10)

# Per-fold scores, then their mean.
print('Cross-Validation scores: {}'.format(scores))
average_score = np.mean(scores)
print('Average score: {}'.format(average_score))

In [None]:
# Check statistical significance of the features with a logistic regression.
# The array-based Logit class lives in `statsmodels.api`; the
# `statsmodels.formula.api` namespace only offers the formula interface in
# current releases, so importing Logit from there fails.
import statsmodels.api as sm

# Choose the variables (target "wl" plus the explanatory columns).
# NOTE: this rebinds the notebook-level names df, X and y with a different
# column order than the classifier cells use; re-run those cells before
# refitting `clf`.
df = Data_train[["wl", "elected_count", "incumbent", "3news_avg", "hereditary",
                 "party_local_advantage", "party_HR_advantage", "age", "female", "newcomer"]]

# Inspect dtypes and non-null counts.
df.info()

# Dependent variable and regressors.
y = df["wl"]
X = df.drop("wl", axis=1)

# Fit the model. NOTE(review): no intercept column is added to X, so the
# model is fitted through the origin -- confirm that this is intended.
model = sm.Logit(y, X)
result = model.fit()

# Show the regression table.
result.summary()

In [None]:
# Pairwise correlations between the 2013 outcome ("wl") and the newspaper
# outlook columns.
Data2013[["wl", "asahi", "nikkei", "yomiuri", "3news_avg"]].corr()

# 2019年　予測の実行

In [None]:
# Reload the preprocessed 2019 frame written by the preprocessing section.
Data2019 = pd.read_pickle("Data2019.pickle")

In [None]:
# Explanatory variables for the 2019 candidates, in the same order as the
# feature columns selected for training.
prediction_columns = [
    "elected_count", "incumbent", "3news_avg", "female", "hereditary",
    "party_local_advantage", "party_HR_advantage", "age", "newcomer",
]
Data_test = Data2019[prediction_columns]

In [None]:
# Alias for the 2019 feature matrix passed to the classifier below.
df2_X = Data_test

In [None]:
# Display the full 2019 frame.
Data2019

In [None]:
# Predict class probabilities for the 2019 candidates with `clf`.
# The `rf_` prefix is a leftover name: `clf` is the gradient boosting
# classifier fitted above, not a random forest.
rf_prob = clf.predict_proba(df2_X) # predicted probabilities

In [None]:
# Show the predicted probabilities for 2019 (from `clf`; the `rf_` prefix is
# a leftover name).
rf_prob

In [None]:
# Wrap the probability array in a DataFrame and name its two columns.
# Assumes column 0 is the "lose" class and column 1 the "win" class, i.e.
# that clf.classes_ is [0, 1] -- TODO confirm.
rf_prob = pd.DataFrame(rf_prob).rename(columns={0: "lose_rate", 1: "win_rate"})

In [None]:
# Put the probability columns next to the candidate data. Rows are matched
# on index labels, so Data2019 must carry the same 0..n-1 index as rf_prob.
Prediction = pd.concat((Data2019, rf_prob), axis="columns")

# 提出用ファイルの作成

In [None]:
# Frequency of each distinct predicted win probability.
Prediction["win_rate"].value_counts()

In [None]:
# Distinct district values, in order of first appearance; displayed as the
# cell output.
prefecture  = Prediction['district'].unique()
prefecture

In [None]:
# Column dtypes and non-null counts of the prediction frame.
Prediction.info()

In [None]:
# Column dtypes and non-null counts of the prediction frame (this cell
# repeats the previous one).
Prediction.info()

In [None]:
# Prediction = Prediction.assign(outcome = 0)

# for i in prefecture:
#     temp =  Prediction[Prediction.district ==  i]
#     temp = temp.sort_values(by='win_rate', ascending=False)
#     magnitude_of_temp = temp.iloc[1,[1]].values
            
#     count = 0
#     for index, row in temp.iterrows():
#         num = temp['num'][index] 
#         if count < magnitude_of_temp[0]:
#             Prediction['outcome'].where(Prediction['num'] != num, 1, inplace=True)
#         count = count +1
    
# Prediction = Prediction.sort_values("num")

In [None]:
# Mark predicted winners: within each district, the candidates with the
# highest win_rate get outcome = 1, up to the district's "magnitude" (the
# number of winners this code allows per district); everyone else gets 0.
#
# Replaces a nested loop that
#   * read the magnitude from the second row of each district (`iloc[1]`),
#     which raises IndexError for a district with a single candidate, and
#   * updated `Prediction['outcome']` via a chained `.where(..., inplace=True)`,
#     which does not write back to `Prediction` under pandas Copy-on-Write.

# Magnitude of each row's district (first non-null value in the district;
# assumed constant within a district, as the previous code also assumed).
district_magnitude = Prediction.groupby("district")["magnitude"].transform("first")

# 1 = highest win_rate in the district; ties are broken by row order so that
# exactly `magnitude` candidates are selected.
rank_in_district = Prediction.groupby("district")["win_rate"].rank(
    method="first", ascending=False
)

Prediction = Prediction.assign(
    outcome=(rank_in_district <= district_magnitude).astype("int64")
)
Prediction = Prediction.sort_values("num")

In [None]:
# Keep only the candidate number and the predicted outcome for submission.
Prediction_submit = Prediction[["num", "outcome"]]

In [None]:
# Load the submission template and fill in the predicted outcomes.
submit = pd.read_csv("candidates_default.csv")
# NOTE(review): this assignment aligns on index labels, not on "num" and not
# on row position. It is only correct if row i of the template is the same
# candidate as index label i of Prediction -- confirm against the template.
submit["outcome"] = Prediction_submit["outcome"]

In [None]:
# Sanity check: the number of predicted winners should be 74.
submit["outcome"].sum()

In [None]:
# Write the submission file without the index column.
submit.to_csv("candidates_gb.csv", index=False)

http://dswaseda.appspot.com/login

In [None]:
#Data_train["elected_count"].value_counts()/2

In [None]:
#Data_test["elected_count"].value_counts()

In [None]:
#Data2016[(Data2016.elected_count>0)&(Data2016.status=="新")]

In [None]:
#Data2013.status.value_counts()

In [None]:
#Data2016.status.value_counts()

In [None]:
#Data2019.status.value_counts()

In [None]:
#Data2013[(Data2013.wl==1)]["elected_count"].value_counts()

In [None]:
# Data2016[(Data2016.incumbent==0)&(Data2016.wl==1)]

In [None]:
# 
# https://togetter.com/li/997652
# https://note.mu/miraisyakai/n/n3817778b5092