# 候補者選挙区の前処理を行う

## ライブラリのインポート

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

# Suppress every warning category for the rest of the session
import warnings
warnings.filterwarnings('ignore')

# Display options (each one is set exactly once)
# Maximum number of characters shown per cell; the pandas default was 50
pd.set_option("display.max_colwidth", 100)
# Maximum number of rows shown when a frame is displayed
pd.set_option("display.max_rows", 100)

## データのインポート

In [2]:
# Load the 2016 candidate workbook; the path is relative to the current working directory.
kouhosha_senkyoku_2016 = pd.read_excel("kouhosha_senkyoku_2016.xlsx")

## データの確認

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
kouhosha_senkyoku_2016.head()

Unnamed: 0,選挙種別,選挙区,氏名,ふりがな,新旧,年齢,性別,政党,推略,現職,...,当選情報,当選回数,ツイッターアカウント,フェイスブックＵＲＬ,ＨＰ、ブログＵＲＬ１,ＨＰ、ブログＵＲＬ２,ユーチューブ,ニコニコ動画,その他１,その他２
0,参院選挙区,北海道,柿木＝克弘,かき・き＝かつ・ひろ,新,48,男,自民,公明,県議,...,,0,,,http://www7.plala.or.jp/kakiki/,,,,,
1,参院選挙区,北海道,鉢呂＝吉雄,はち・ろ＝よし・お,新,68,男,民進,,なし,...,T,1,,https://www.facebook.com/hachiroyoshio,http://www.tsukuru.org/,,,,,
2,参院選挙区,北海道,長谷川＝岳,は・せ・がわ＝がく,現,45,男,自民,公明,参院選挙区,...,T,2,@gaku_hasegawa,https://ja-jp.facebook.com/hasegawagaku,http://www.hasegawagaku.jp/,http://ameblo.jp/hasegawa-gaku/,,,,
3,参院選挙区,北海道,水越＝寛陽,みず・こし＝かん・よう,新,36,男,無所,,なし,...,,0,,,,,,,,
4,参院選挙区,北海道,佐藤＝和夫,さ・とう＝かず・お,新,69,男,こ,,なし,...,,0,,https://www.facebook.com/kazuetu,,,,,,


In [4]:
# Column dtypes and non-null counts of the raw frame (shows which fields are sparsely filled).
kouhosha_senkyoku_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 21 columns):
選挙種別          225 non-null object
選挙区           225 non-null object
氏名            225 non-null object
ふりがな          225 non-null object
新旧            225 non-null object
年齢            225 non-null int64
性別            225 non-null object
政党            225 non-null object
推略            85 non-null object
現職            225 non-null object
３２字略歴         225 non-null object
当選情報          73 non-null object
当選回数          225 non-null int64
ツイッターアカウント    113 non-null object
フェイスブックＵＲＬ    156 non-null object
ＨＰ、ブログＵＲＬ１    185 non-null object
ＨＰ、ブログＵＲＬ２    36 non-null object
ユーチューブ        35 non-null object
ニコニコ動画        2 non-null object
その他１          7 non-null object
その他２          1 non-null object
dtypes: int64(2), object(19)
memory usage: 37.0+ KB


## 前処理を行う

In [5]:
# Map the Japanese source headers to the English names used from here on
column_name_map = {
    "選挙種別": "category",
    "選挙区": "district",
    "氏名": "candidate_J",
    "ふりがな": "hurigana",
    "新旧": "status",
    "年齢": "age",
    "性別": "sex",
    "政党": "party",
    "推略": "suisen",
    "現職": "genshoku",
    "当選情報": "wl",
    "当選回数": "elected_count",
    "ツイッターアカウント": "twitter",
    "フェイスブックＵＲＬ": "facebook",
    "ＨＰ、ブログＵＲＬ１": "hp1",
    "ＨＰ、ブログＵＲＬ２": "hp2",
    "ユーチューブ": "youtube",
    "ニコニコ動画": "niconico",
    "その他１": "others1",
    "その他２": "others2",
}
kouhosha_senkyoku_2016 = kouhosha_senkyoku_2016.rename(columns=column_name_map)

# Keep only the columns listed here; every other column (e.g. "category", "hurigana") is dropped
kept_columns = [
    "district", "candidate_J", "status", "age", "sex", "party",
    "suisen", "genshoku", "wl", "elected_count",
    "twitter", "facebook", "hp1", "hp2", "youtube", "niconico",
    "others1", "others2",
]
kouhosha_senkyoku_2016 = kouhosha_senkyoku_2016.loc[:, kept_columns]

In [6]:
# Confirm the rename/selection: only the selected, English-named columns should remain.
kouhosha_senkyoku_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 18 columns):
district         225 non-null object
candidate_J      225 non-null object
status           225 non-null object
age              225 non-null int64
sex              225 non-null object
party            225 non-null object
suisen           85 non-null object
genshoku         225 non-null object
wl               73 non-null object
elected_count    225 non-null int64
twitter          113 non-null object
facebook         156 non-null object
hp1              185 non-null object
hp2              36 non-null object
youtube          35 non-null object
niconico         2 non-null object
others1          7 non-null object
others2          1 non-null object
dtypes: int64(2), object(16)
memory usage: 31.7+ KB


Memo:
・足りないカラムはgrad, hereditary
・ダミー変数化する必要がある。
・2010年以前のデータに関してはcsvファイルがないので手作業が必要そうである。

## ダミー変数化する

In [7]:
# List the distinct status codes before one-hot encoding them.
kouhosha_senkyoku_2016["status"].unique()

array(['新', '現', '前', '元'], dtype=object)

In [8]:
# One-hot encode each categorical column.
# dtype is pinned to uint8 so the indicators are 0/1 integers on every pandas release;
# pandas >= 2.0 would otherwise return booleans, which changes what gets exported.
# Rows with a missing value (possible in suisen and wl) become all-zero rows because
# dummy_na is left at its default of False.
status_dummy = pd.get_dummies(kouhosha_senkyoku_2016["status"], dtype="uint8")
sex_dummy = pd.get_dummies(kouhosha_senkyoku_2016["sex"], dtype="uint8")
party_dummy = pd.get_dummies(kouhosha_senkyoku_2016["party"], dtype="uint8")
suisen_dummy = pd.get_dummies(kouhosha_senkyoku_2016["suisen"], dtype="uint8")
genshoku_dummy = pd.get_dummies(kouhosha_senkyoku_2016["genshoku"], dtype="uint8")
wl_dummy = pd.get_dummies(kouhosha_senkyoku_2016["wl"], dtype="uint8")

In [9]:
# Show only the first ten rows; displaying the bare frame would dump every candidate row.
party_dummy.head(10)

Unnamed: 0,お維,こ,公明,共産,幸福,怒り,支持,改革,民進,無所,社民,自民,諸派
0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,0,0,0


In [10]:
# Give every dummy block English column names, keeping only the categories used later.

# Status: rename all four status codes
status_labels = {"元": "previous", "新": "newcomer", "現": "incumbent", "前": "predecessor"}
status_dummy = status_dummy.rename(columns=status_labels)

# Sex: a single "female" indicator remains once the "男" column is removed
sex_dummy = sex_dummy.drop(columns="男").rename(columns={"女": "female"})

# Party: the dict order defines both which columns are kept and their order
party_labels = {"自民": "ldp", "民進": "dpj", "共産": "jcp", "公明": "kom", "お維": "ishin", "社民": "syamin"}
party_dummy = party_dummy[list(party_labels)].rename(columns=party_labels)

# Endorsement (suisen): keep three endorsement categories
suisen_labels = {"自民": "suisen_ldp", "民主社民": "suisen_dpj", "公明": "suisen_kom"}
suisen_dummy = suisen_dummy[list(suisen_labels)].rename(columns=suisen_labels)

# Current post (genshoku): keep three categories
genshoku_labels = {
    "参院選挙区": "councillors_senkyoku",
    "参院比例区": "councillors_hireiku",
    "県議": "kengi",
}
genshoku_dummy = genshoku_dummy[list(genshoku_labels)].rename(columns=genshoku_labels)

# Result flag: the "T" indicator becomes "wl"; the raw "wl" text column is removed
# from the candidate frame so the name is free for the dummy
wl_dummy = wl_dummy.rename(columns={"T": "wl"})
kouhosha_senkyoku_2016 = kouhosha_senkyoku_2016.drop(columns="wl")

In [11]:
# 横に連結する
Data = pd.concat([kouhosha_senkyoku_2016, status_dummy,sex_dummy, party_dummy, suisen_dummy,
                   genshoku_dummy, wl_dummy], axis=1)

In [12]:
### SNS dummy variables

In [13]:
# Presence flag per SNS channel: 1 when the candidate has a value in the column, 0 when it is empty.
# notna().astype(int) always produces an integer column; a fillna(0)/mask(!= 0, 1) chain gives the
# same 0/1 values here, but its result dtype (object vs. integer) depends on how the installed
# pandas downcasts object columns.
for sns_column in ["twitter", "facebook", "youtube"]:
    Data[sns_column + "_dummy"] = Data[sns_column].notna().astype(int)

# SNS is the product of the three flags, i.e. 1 only when all three channels are present
Data["SNS"] = Data["youtube_dummy"] * Data["twitter_dummy"] * Data["facebook_dummy"]

In [14]:
# Remove the "predecessor" status indicator from the combined frame
Data = Data.drop(columns=["predecessor"])

In [15]:
# Check the assembled frame: column list, dtypes and non-null counts.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 38 columns):
district                225 non-null object
candidate_J             225 non-null object
status                  225 non-null object
age                     225 non-null int64
sex                     225 non-null object
party                   225 non-null object
suisen                  85 non-null object
genshoku                225 non-null object
elected_count           225 non-null int64
twitter                 113 non-null object
facebook                156 non-null object
hp1                     185 non-null object
hp2                     36 non-null object
youtube                 35 non-null object
niconico                2 non-null object
others1                 7 non-null object
others2                 1 non-null object
previous                225 non-null uint8
newcomer                225 non-null uint8
incumbent               225 non-null uint8
female                  225 n

In [16]:
## Drop the columns that were one-hot encoded
# NOTE(review): the drop below is commented out, so status/sex/party/suisen/genshoku stay in Data.
#Data = Data.drop(["status", "sex", "party", "suisen", "genshoku"], axis=1)

In [17]:
# Re-inspect Data; the column set is unchanged because the preceding drop is commented out.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 38 columns):
district                225 non-null object
candidate_J             225 non-null object
status                  225 non-null object
age                     225 non-null int64
sex                     225 non-null object
party                   225 non-null object
suisen                  85 non-null object
genshoku                225 non-null object
elected_count           225 non-null int64
twitter                 113 non-null object
facebook                156 non-null object
hp1                     185 non-null object
hp2                     36 non-null object
youtube                 35 non-null object
niconico                2 non-null object
others1                 7 non-null object
others2                 1 non-null object
previous                225 non-null uint8
newcomer                225 non-null uint8
incumbent               225 non-null uint8
female                  225 n

## 当選回数を整合的にする

In [18]:
# Helper column: the elected count lowered by one for every row
Data["elected_count_adjusted"] = Data["elected_count"] - 1

# Rows flagged wl == 1 take the lowered value; all other rows keep their count.
# Presumably this makes the count exclude the win in this election (TODO confirm).
is_winner = Data["wl"] == 1
Data["elected_count"] = Data["elected_count"].mask(is_winner, Data["elected_count_adjusted"])

In [19]:
# Write the processed frame to Data2016.xlsx (relative path), omitting the index column.
Data.to_excel("Data2016.xlsx", index=False)