# Chapter 3: データ分析開始

## レシピ
* [データ分析ルーチンの開発](#レシピ20-データ分析ルーチンの開発)
* [データ型を変更してメモリ削減](#レシピ21-データ型を変更してメモリ削減)
* [最大の中の最小を選択](#レシピ22-最大の中の最小を選択)
* [ソートして各グループでの最大を選択](#レシピ23-ソートして各グループでの最大を選択)
* [sort_valuesでnlargestの代用](#レシピ24-sort_valuesでnlargestの代用)
* [トレール注文の価格計算](#レシピ25-トレール注文の価格計算)

In [1]:
import pandas as pd
import numpy as np
from IPython.display import display
# Show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)

# レシピ20 データ分析ルーチンの開発

In [2]:
# Load the college dataset and preview its first rows.
college = pd.read_csv('data/college.csv')
college.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,4206.0,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,11383.0,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,291.0,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,5451.0,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,4811.0,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [5]:
# shape gives the DataFrame dimensions as (rows, columns).
college.shape

(7535, 27)

In [7]:
# info() reports each column's dtype and non-null count, plus memory usage.
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
INSTNM                7535 non-null object
CITY                  7535 non-null object
STABBR                7535 non-null object
HBCU                  7164 non-null float64
MENONLY               7164 non-null float64
WOMENONLY             7164 non-null float64
RELAFFIL              7535 non-null int64
SATVRMID              1185 non-null float64
SATMTMID              1196 non-null float64
DISTANCEONLY          7164 non-null float64
UGDS                  6874 non-null float64
UGDS_WHITE            6874 non-null float64
UGDS_BLACK            6874 non-null float64
UGDS_HISP             6874 non-null float64
UGDS_ASIAN            6874 non-null float64
UGDS_AIAN             6874 non-null float64
UGDS_NHPI             6874 non-null float64
UGDS_2MOR             6874 non-null float64
UGDS_NRA              6874 non-null float64
UGDS_UNKN             6874 non-null float64
PPTUG_EF          

In [8]:
# Summary statistics for the numeric columns, transposed so that each
# column becomes a row and the table is easier to read.
college.describe(include=[np.number]).T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,475.0,510.0,555.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,482.0,520.0,565.0,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,117.0,412.5,1929.5,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.2675,0.5557,0.747875,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.036125,0.10005,0.2577,1.0


In [9]:
# Summary statistics for the object (string) and categorical columns.
# The builtin `object` replaces the `np.object` alias, which NumPy deprecated
# in 1.20 and removed in 1.24; 'category' is the selector pandas documents
# for categorical columns.
college.describe(include=[object, 'category']).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Kenneth Shuler School of Cosmetology-Columbia,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacySuppressed,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacySuppressed,1510


In [10]:
# describe() accepts an explicit list of percentiles for the numeric columns.
college.describe(
    include=[np.number],
    percentiles=[.01, .05, .10, .25, .5, .75, .9, .95, .99],
).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,95%,99%,max
HBCU,7164.0,0.014238,0.118478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
MENONLY,7164.0,0.009213,0.095546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
WOMENONLY,7164.0,0.005304,0.072642,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
RELAFFIL,7535.0,0.190975,0.393096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
SATVRMID,1185.0,522.819409,68.578862,290.0,390.0,430.0,447.4,475.0,510.0,555.0,605.0,665.0,730.0,765.0
SATMTMID,1196.0,530.76505,73.469767,310.0,395.0,430.0,453.0,482.0,520.0,565.0,630.0,685.0,745.25,785.0
DISTANCEONLY,7164.0,0.005583,0.074519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
UGDS,6874.0,2356.83794,5474.275871,0.0,14.0,31.65,49.0,117.0,412.5,1929.5,6512.3,11858.05,26015.29,151558.0
UGDS_WHITE,6874.0,0.510207,0.286958,0.0,0.0,0.013265,0.06879,0.2675,0.5557,0.747875,0.86297,0.927315,1.0,1.0
UGDS_BLACK,6874.0,0.189997,0.224587,0.0,0.0,0.0,0.00753,0.036125,0.10005,0.2577,0.51571,0.726715,0.961467,1.0


In [11]:
# Data dictionary for the college dataset.
pd.read_csv('data/college_data_dictionary.csv')

Unnamed: 0,column_name,description
0,INSTNM,Institution Name
1,CITY,City Location
2,STABBR,State Abbreviation
3,HBCU,Historically Black College or University
4,MENONLY,0/1 Men Only
5,WOMENONLY,0/1 Women only
6,RELAFFIL,0/1 Religious Affiliation
7,SATVRMID,SAT Verbal Median
8,SATMTMID,SAT Math Median
9,DISTANCEONLY,Distance Education Only


# レシピ21 データ型を変更してメモリ削減

In [12]:
# Pick a few columns with different dtypes to see how much memory can be saved.
# .copy() makes col2 an independent DataFrame, so the column assignments made
# on it later never trigger SettingWithCopyWarning or touch `college`.
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER', 'INSTNM', 'STABBR']
col2 = college.loc[:, different_cols].copy()
col2.head()

Unnamed: 0,RELAFFIL,SATMTMID,CURROPER,INSTNM,STABBR
0,0,420.0,1,Alabama A & M University,AL
1,0,565.0,1,University of Alabama at Birmingham,AL
2,1,,1,Amridge University,AL
3,0,590.0,1,University of Alabama in Huntsville,AL
4,0,430.0,1,Alabama State University,AL


In [14]:
# Show the dtype of each column.
col2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [15]:
# Record the memory usage of each column; this is the baseline that the
# converted frame is compared against further down.
original_men = col2.memory_usage(deep=True)
original_men

Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

In [17]:
# Convert the RELAFFIL column from a 64-bit integer to an 8-bit integer.
col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)
col2.dtypes

RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

In [19]:
# Confirm the memory reduction.
col2.memory_usage(deep=True)

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

In [21]:
# To cut memory further, object columns with few unique values can be
# converted to the category dtype; first count the unique values per column.
col2.select_dtypes(include=['object']).nunique()

INSTNM    7535
STABBR      59
dtype: int64

In [23]:
# Convert STABBR to the category dtype.
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

In [26]:
# Memory usage of each column after the dtype conversions.
new_mem = col2.memory_usage(deep=True)
new_mem

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660699
STABBR       13576
dtype: int64

In [28]:
# Compare the updated memory usage with the original, column by column.
new_mem / original_men

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000695
STABBR      0.030538
dtype: float64

In [29]:
# An index that stores every row label uses far more memory than a RangeIndex.
# pd.Int64Index was removed in pandas 2.0, so the materialized integer index
# is built with pd.Index from the labels as a NumPy array (passing the
# RangeIndex itself could hand back another RangeIndex).
college.index = pd.Index(college.index.values)
college.index.memory_usage()  # the RangeIndex reported 128 in this run

60280

# レシピ22 最大の中の最小を選択

### 上位n個の値をもつデータのグループを1つのカラムにまとめ、この部分集合で別のカラムの下位m個の値を求める

In [30]:
# Load the movie dataset and select the columns of interest.
movie = pd.read_csv('data/movie.csv')
movie2 = movie[['movie_title', 'imdb_score', 'budget']]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates of the Caribbean: At World's End,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark Knight Rises,8.5,250000000.0
4,Star Wars: Episode VII - The Force Awakens,7.1,


In [31]:
# Use nlargest to pick the 100 movies with the highest imdb_score.
movie2.nlargest(100, 'imdb_score').head()

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxer: Vengeance,9.1,17000000.0


In [32]:
# Chain nsmallest onto nlargest: among the 100 top-scored movies,
# return the 5 with the smallest budget.
(
    movie2
    .nlargest(100, 'imdb_score')
    .nsmallest(5, 'budget')
)

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfly Girl,8.7,180000.0
4801,Children of Heaven,8.5,180000.0
4706,12 Angry Men,8.9,350000.0
4550,A Separation,8.4,500000.0
4636,The Other Dream Team,8.4,500000.0


# レシピ23 ソートして各グループでの最大を選択

### 各年の最高評価を受けた映画を選ぶ

In [5]:
# Keep only the three columns that are needed.
movie = pd.read_csv("data/movie.csv")
movie2 = movie[['movie_title', 'title_year', 'imdb_score']]

In [7]:
# Sort by title_year with sort_values; the default is ascending order,
# so descending order is requested explicitly.
movie2.sort_values('title_year', ascending=False).head()

Unnamed: 0,movie_title,title_year,imdb_score
3884,The Veil,2016.0,4.7
2375,My Big Fat Greek Wedding 2,2016.0,6.1
2794,Miracles from Heaven,2016.0,6.8
92,Independence Day: Resurgence,2016.0,5.5
153,Kung Fu Panda 3,2016.0,7.2


In [8]:
# Pass a list to sort by several columns at once.
movie3 = movie2.sort_values(['title_year', 'imdb_score'], ascending=False)
movie3.head()

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
4277,A Beginner's Guide to Snuff,2016.0,8.7
3798,Airlift,2016.0,8.5
27,Captain America: Civil War,2016.0,8.2
98,Godzilla Resurgence,2016.0,8.2


In [10]:
# drop_duplicates keeps only the first row for each year.
# When columns are passed to subset, only those columns are compared.
movie_top_year = movie3.drop_duplicates(subset='title_year')
movie_top_year.head()

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxer: Vengeance,2016.0,9.1
3745,Running Forever,2015.0,8.6
4369,Queen of the Mountains,2014.0,8.7
3935,"Batman: The Dark Knight Returns, Part 2",2013.0,8.4
3,The Dark Knight Rises,2012.0,8.5


In [12]:
# Columns can be sorted in different directions within a single call:
# pass a list of booleans to ascending, one per sort column.
movie4 = movie[['movie_title', 'title_year', 'content_rating', 'budget']]
movie4_sorted = movie4.sort_values(
    by=['title_year', 'content_rating', 'budget'],
    ascending=[False, False, True],
)
movie4_sorted.drop_duplicates(subset=['title_year', 'content_rating']).head(10)

Unnamed: 0,movie_title,title_year,content_rating,budget
4026,Compadres,2016.0,R,3000000.0
4658,Fight to the Finish,2016.0,PG-13,150000.0
4661,Rodeo Girl,2016.0,PG,500000.0
3252,The Wailing,2016.0,Not Rated,
4659,Alleluia! The Devil's Carnival,2016.0,,500000.0
4731,Bizarre,2015.0,Unrated,500000.0
812,The Ridiculous 6,2015.0,TV-14,
4831,The Gallows,2015.0,R,100000.0
4825,Romantic Schemer,2015.0,PG-13,125000.0
3796,R.L. Stine's Monsterville: The Cabinet of Souls,2015.0,PG,4400000.0


# レシピ24 sort_valuesでnlargestの代用

### レシピ22をsort_valuesだけで実現する

In [14]:
# Reload the movie data and select the same three columns used in Recipe 22.
movie = pd.read_csv("data/movie.csv")
movie2 = movie[['movie_title', 'imdb_score', 'budget']]

In [24]:
# Rebuild the Recipe 22 query with only sort_values and head. The output
# shown for this cell differs from the nlargest/nsmallest output, so the two
# forms should not be assumed to return identical rows.
(
    movie2
    .sort_values('imdb_score', ascending=False)
    .head(100)
    .sort_values('budget')
    .head()
)

Unnamed: 0,movie_title,imdb_score,budget
4815,A Charlie Brown Christmas,8.4,150000.0
4801,Children of Heaven,8.5,180000.0
4804,Butterfly Girl,8.7,180000.0
4706,12 Angry Men,8.9,350000.0
4636,The Other Dream Team,8.4,500000.0


In [25]:
# Sort by imdb_score and then by budget, both in descending order.
movie2.sort_values(['imdb_score','budget'], ascending=[False,False])

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering Inferno,9.5,
1920,The Shawshank Redemption,9.3,25000000.0
3402,The Godfather,9.2,6000000.0
4312,Kickboxer: Vengeance,9.1,17000000.0
2779,Dekalog,9.1,
...,...,...,...
2240,Disaster Movie,1.9,25000000.0
2266,Superbabies: Baby Geniuses 2,1.9,20000000.0
4498,The Helix... Loaded,1.9,1000000.0
1126,Foodfight!,1.7,65000000.0


# レシピ25 トレール注文の価格計算

In [27]:
# Fetch TSLA stock data starting from 2017-01-01.
# NOTE(review): this depends on pandas_datareader's 'yahoo' data source being
# reachable — confirm it still works before re-running the cell.
import pandas_datareader as pdr
tsla = pdr.DataReader('tsla', data_source='yahoo', start='2017-1-1')
tsla.head(8)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,44.066002,42.192001,42.972,43.397999,29616500.0,43.397999
2017-01-04,45.599998,42.862,42.950001,45.397999,56067500.0,45.397999
2017-01-05,45.495998,44.389999,45.284,45.349998,29558500.0,45.349998
2017-01-06,46.062,45.09,45.386002,45.801998,27639500.0,45.801998
2017-01-09,46.383999,45.599998,45.793999,46.256001,19897500.0,46.256001
2017-01-10,46.400002,45.377998,46.400002,45.973999,18300000.0,45.973999
2017-01-11,45.995998,45.335999,45.813999,45.945999,18254000.0,45.945999
2017-01-12,46.139999,45.116001,45.812,45.917999,18951000.0,45.917999


In [28]:
# Use the closing price of each trading day.
tsla_close = tsla['Close']

In [29]:
# cummax tracks the highest closing price seen up to each day.
tsla_cummax = tsla_close.cummax()
tsla_cummax.head(8)

Date
2017-01-03    43.397999
2017-01-04    45.397999
2017-01-05    45.397999
2017-01-06    45.801998
2017-01-09    46.256001
2017-01-10    46.256001
2017-01-11    46.256001
2017-01-12    46.256001
Name: Close, dtype: float64

In [30]:
# To limit the loss to 10%, multiply tsla_cummax by 0.9.
tsla_cummax_stop = tsla_cummax * .9
tsla_cummax_stop.head(8)

Date
2017-01-03    39.058199
2017-01-04    40.858199
2017-01-05    40.858199
2017-01-06    41.221798
2017-01-09    41.630400
2017-01-10    41.630400
2017-01-11    41.630400
2017-01-12    41.630400
Name: Close, dtype: float64