# 選挙区候補者の前処理を行う

## ライブラリのインポート

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

# Silence library warnings so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings(action='ignore')

# pandas display limits:
#   display.max_colwidth - characters shown per cell (the default was 50)
#   display.max_rows     - rows shown before the output is truncated
for option_name, option_value in (("display.max_colwidth", 100),
                                  ("display.max_rows", 100)):
    pd.set_option(option_name, option_value)

## データのインポート

In [2]:
# Read the candidate spreadsheet (path is relative to the notebook's working directory).
kouhosha_senkyoku_2013 = pd.read_excel(io="kouhosha_senkyoku_2013.xlsx")

## データの確認

In [3]:
# Preview the first rows to check that the sheet loaded as expected.
kouhosha_senkyoku_2013.head()

Unnamed: 0,選挙種別,都道府県,選挙区,氏名,ふりがな,新旧,年齢,性別,政党,推略,...,当選情報,当選回数,Twitter,Facebook,HP1,HP2,YouTube,ニコニコ動画,その他１,その他２
0,参院選挙区,北海道,北海道,小川＝勝也,お・がわ＝かつ・や,現,50,男,民主,,...,T,4,@katsusikanai,https://www.facebook.com/katsusikanai,http://www.ogawa-k.net/,,http://www.youtube.com/user/KatsuyaOgawaOffice,,,
1,参院選挙区,北海道,北海道,森山＝佳則,もり・やま＝よし・のり,新,46,男,幸福,,...,,0,@YosshiiMoriyama,https://www.facebook.com/moriyama.hokkaido,https://sites.google.com/site/kofukumoriyama/home,http://moriyama-yoshinori-hrp.blogspot.jp/,https://www.youtube.com/user/ChannelMoriyama7,,,
2,参院選挙区,北海道,北海道,伊達＝忠一,だ・て＝ちゅう・いち,現,74,男,自民,公明,...,T,3,,http://www.facebook.com/datechu.jp,http://www.datechu.jp/,,,,,
3,参院選挙区,北海道,北海道,森＝英士,もり＝つね・と,新,35,男,共産,,...,,0,,http://www.facebook.com/tsuneto.mori,http://www.jcphkdbl.gr.jp,http://morimori.blog-mmo.com/,,,,
4,参院選挙区,北海道,北海道,安住＝太伸,あ・ずみ＝たか・のぶ,新,43,男,みん,,...,,0,@azumitakanobu,https://www.facebook.com/takanobu.azumi,http://www.az3.net/,,http://www.youtube.com/user/yourpartyhokkaido,,,


In [4]:
# Inspect column dtypes and non-null counts of the raw table.
kouhosha_senkyoku_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 23 columns):
選挙種別        271 non-null object
都道府県        271 non-null object
選挙区         271 non-null object
氏名          271 non-null object
ふりがな        271 non-null object
新旧          271 non-null object
年齢          271 non-null int64
性別          271 non-null object
政党          271 non-null object
推略          60 non-null object
現職          271 non-null object
１３字略歴       271 non-null object
３２字略歴       271 non-null object
当選情報        73 non-null object
当選回数        271 non-null int64
Twitter     182 non-null object
Facebook    225 non-null object
HP1         254 non-null object
HP2         79 non-null object
YouTube     86 non-null object
ニコニコ動画      5 non-null object
その他１        22 non-null object
その他２        2 non-null object
dtypes: int64(2), object(21)
memory usage: 48.8+ KB


## 前処理を行う

In [5]:
# Map the Japanese spreadsheet headers to short English column names.
# Headers that are not listed here keep their original name.
column_names_en = {
    "選挙種別": "category",
    "都道府県": "todohuken",
    "選挙区": "district",
    "氏名": "candidate_J",
    "ふりがな": "hurigana",
    "新旧": "status",
    "年齢": "age",
    "性別": "sex",
    "政党": "party",
    "推略": "suisen",
    "現職": "genshoku",
    "当選情報": "wl",
    "当選回数": "elected_count",
    "Twitter": "twitter",
    "Facebook": "facebook",
    "HP1": "hp1",
    "HP2": "hp2",
    "YouTube": "youtube",
    "ニコニコ動画": "niconico",
    "その他１": "others1",
    "その他２": "others2",
}
renamed_columns = kouhosha_senkyoku_2013.rename(columns=column_names_en)

# Keep only the columns that the following cells work with.
columns_to_keep = [
    "district", "candidate_J", "status", "age", "sex", "party",
    "suisen", "genshoku", "wl", "elected_count",
    "twitter", "facebook", "hp1", "hp2", "youtube", "niconico",
    "others1", "others2",
]
kouhosha_senkyoku_2013 = renamed_columns[columns_to_keep]

In [6]:
# Confirm the renamed/selected columns and their non-null counts.
kouhosha_senkyoku_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 18 columns):
district         271 non-null object
candidate_J      271 non-null object
status           271 non-null object
age              271 non-null int64
sex              271 non-null object
party            271 non-null object
suisen           60 non-null object
genshoku         271 non-null object
wl               73 non-null object
elected_count    271 non-null int64
twitter          182 non-null object
facebook         225 non-null object
hp1              254 non-null object
hp2              79 non-null object
youtube          86 non-null object
niconico         5 non-null object
others1          22 non-null object
others2          2 non-null object
dtypes: int64(2), object(16)
memory usage: 38.2+ KB


Memo:
・足りないカラムはgrad, hereditary
・ダミー変数化する必要がある。
・2010年以前のデータに関してはcsvファイルがないので手作業が必要そうである。

## ダミー変数化する

In [7]:
# List the distinct values of "status" before one-hot encoding it.
kouhosha_senkyoku_2013["status"].unique()

array(['現', '新', '元'], dtype=object)

In [8]:
# One-hot encode each categorical column into its own indicator frame.
(status_dummy, sex_dummy, party_dummy,
 suisen_dummy, genshoku_dummy, wl_dummy) = (
    pd.get_dummies(kouhosha_senkyoku_2013[column])
    for column in ("status", "sex", "party", "suisen", "genshoku", "wl")
)

In [9]:
# Rename the indicator columns to English and keep only the categories used later.
#
# Every step renames first and then selects by the NEW names, and the drops use
# errors="ignore". rename() skips keys that are absent, so executing this cell a
# second time is a no-op instead of raising KeyError on the already-renamed frames.
status_dummy = status_dummy.rename(columns={"元": "previous", "新": "newcomer", "現": "incumbent"})

# Only the female indicator is kept; the male column is dropped.
sex_dummy = sex_dummy.rename(columns={"女": "female"}).drop(columns="男", errors="ignore")

# Parties that get their own indicator column (others are discarded).
party_names = {"自民": "ldp", "民主": "dpj", "共産": "jcp", "公明": "kom", "維新": "ishin", "社民": "syamin"}
party_dummy = party_dummy.rename(columns=party_names)[list(party_names.values())]

# Endorsing parties that get their own indicator column.
suisen_names = {"自民": "suisen_ldp", "民主": "suisen_dpj", "公明": "suisen_kom"}
suisen_dummy = suisen_dummy.rename(columns=suisen_names)[list(suisen_names.values())]

# Current-office categories that get their own indicator column.
genshoku_names = {"参院選挙区": "councillors_senkyoku", "参院比例区": "councillors_hireiku", "県議": "kengi"}
genshoku_dummy = genshoku_dummy.rename(columns=genshoku_names)[list(genshoku_names.values())]

# The "T" indicator becomes the new "wl" column, so the raw "wl" column is removed
# from the base table to avoid two columns with the same name after concatenation.
wl_dummy = wl_dummy.rename(columns={"T": "wl"})
kouhosha_senkyoku_2013 = kouhosha_senkyoku_2013.drop(columns="wl", errors="ignore")

In [10]:
# Join the base table and all indicator frames side by side (column-wise).
frames_to_join = [
    kouhosha_senkyoku_2013,
    status_dummy,
    sex_dummy,
    party_dummy,
    suisen_dummy,
    genshoku_dummy,
    wl_dummy,
]
Data = pd.concat(frames_to_join, axis="columns")

In [11]:
# SNS (social media) dummy variables

In [12]:
# Per platform, flag whether the candidate has an entry (1) or the cell is missing (0).
# notna().astype(int) gives an explicit integer dtype instead of depending on how
# fillna()/mask() treat an object-dtype column, and one loop replaces three
# copy-pasted blocks.
for platform in ("twitter", "facebook", "youtube"):
    Data[platform + "_dummy"] = Data[platform].notna().astype(int)

# SNS is 1 only for candidates who use Twitter, Facebook and YouTube all together,
# and 0 otherwise (product of the three 0/1 indicators).
Data["SNS"] = Data["youtube_dummy"] * Data["twitter_dummy"] * Data["facebook_dummy"]

In [13]:
# Check the combined table: original columns plus all indicator columns.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271 entries, 0 to 270
Data columns (total 38 columns):
district                271 non-null object
candidate_J             271 non-null object
status                  271 non-null object
age                     271 non-null int64
sex                     271 non-null object
party                   271 non-null object
suisen                  60 non-null object
genshoku                271 non-null object
elected_count           271 non-null int64
twitter                 182 non-null object
facebook                225 non-null object
hp1                     254 non-null object
hp2                     79 non-null object
youtube                 86 non-null object
niconico                5 non-null object
others1                 22 non-null object
others2                 2 non-null object
previous                271 non-null uint8
newcomer                271 non-null uint8
incumbent               271 non-null uint8
female                  271 

In [14]:
## Drop the original columns that were dummy-encoded (currently disabled)
#Data = Data.drop(["status", "sex", "party", "suisen", "genshoku"], axis=1)

## 当選回数を整合的にする（当選者 wl==1 の当選回数から1を引く）

In [15]:
# For winners (wl == 1) replace elected_count with elected_count - 1; all other
# rows keep their value.
# NOTE(review): presumably the source counts already include this election's win,
# which is why winners are reduced by one - confirm against the data source.
#
# The guard makes the cell safe to re-run: without it every execution would subtract
# another 1 from the winners, because elected_count is overwritten from itself.
if "elected_count_adjusted" not in Data.columns:
    is_winner = Data["wl"] == 1
    # Helper column: count - 1 for every row (including non-winners).
    Data["elected_count_adjusted"] = Data["elected_count"] - 1
    Data["elected_count"] = Data["elected_count"].mask(is_winner, Data["elected_count_adjusted"])

In [16]:
# Write the processed table to an Excel file, leaving out the row index.
Data.to_excel(excel_writer="Data2013.xlsx", index=False)