# pandasのtips

In [18]:
import pandas as pd
import pandas_profiling as pdp
from sklearn import datasets

## テキストファイルの読み込み

* 適切な処理を行ったファイルを普通に読み込む場合は、pd.read_csv()で問題ない

* shift-jisになっていたり、機種依存文字が含まれていたりすると、read_csv()では読み込めないことがあるので、注意が必要

In [3]:
# Load a well-formed tab-separated text file; no special encoding handling is needed here
testdata_1_path = "./data/pandas_tips_testdata_1.txt"
df_1 = pd.read_csv(
    testdata_1_path,
    delimiter="\t",  # tab-separated values
)
df_1.head()

Unnamed: 0,id,user_id,name,create_time,update_time
0,1,111111111,あああ,2018-05-01 00:00:00,2018-05-01 00:00:00
1,2,222222222,いいい,2018-05-01 00:00:00,2018-05-01 00:00:00
2,3,333333333,ううう,2018-05-01 00:00:00,2018-05-01 00:00:00
3,4,444444444,えええ,2018-05-01 00:00:00,2018-05-01 00:00:00
4,5,555555555,おおお,2018-05-01 00:00:00,2018-05-01 00:00:00


## shift-jisで機種依存文字が入っているファイルの読み込み

shift-jisのファイルは、読み込み時にエンコードを指定すれば読み込める場合が多い

機種依存文字があると、エンコードを指定しても読み込めない

このような場合は、open関数でerrors="ignore"を指定し、デコードできない機種依存文字を無視してファイルを開いてから、そのファイルオブジェクトをpandasに渡せばよい

組み込みのopenでも、codecs.openでも読み込み可

この際、機種依存文字は削除された状態で読み込まれる


## nullが多いファイルの読み込み

上記の機種依存文字があるようなファイルを読み込む際、右端のカラムにnullが存在すると、DataFrameに渡す際にカラム数が合わないというエラーが出る

この場合は、カラム名のリストを渡すことで、カラム数を明示して読み込むことができる


## 巨大なファイルの読み込み

巨大なファイルを扱う場合、データが疎なカラムが存在すると、読み込みの最初に推測されたデータ型が後から出てきたデータと合わずにエラーが出ることがある

この際は、読み込み時にdtypeを指定しておくことで、エラーを回避できる

In [12]:
# Read a Shift-JIS file that contains platform-dependent characters.
# errors="ignore" makes the decoder skip bytes it cannot decode instead of raising.
# NOTE(review): Python's separate "cp932" codec may decode these characters rather
# than dropping them — confirm against the real data before switching.
with open("./data/pandas_tips_testdata_2.txt", mode="r", encoding="Shift-JIS", errors="ignore") as tsv_file:
    # Consume the header line ourselves and turn it into the list of column names
    header_line = tsv_file.readline()
    col_list = header_line.rstrip("\r\n").split("\t")

    # Hand the remaining lines (the file position is now past the header) to pandas
    df_2 = pd.read_table(
        tsv_file,
        delimiter="\t",                          # field separator
        names=col_list,                          # state the column names (and count) explicitly
        dtype={"id": int, "create_time": str},   # pin dtypes instead of relying on inference
    )

df_2.head()

Unnamed: 0,id,user_id,name,create_time,update_time
0,1,111111111,あああ,2018-05-01 00:00:00,2018-05-01 00:00:00
1,2,222222222,いいい,2018-05-01 00:00:00,2018-05-01 00:00:00
2,3,333333333,ううう,2018-05-01 00:00:00,2018-05-01 00:00:00
3,4,444444444,えええ,2018-05-01 00:00:00,2018-05-01 00:00:00
4,5,555555555,おおお,2018-05-01 00:00:00,2018-05-01 00:00:00


## データの確認

pandasにはデータを確認するメソッドがいくつか用意されている

* DataFrame.info()

* DataFrame.describe()

* pandas-profiling
  * 参考: <https://qiita.com/h_kobayashi1125/items/02039e57a656abe8c48f>



In [20]:
# Use the iris dataset bundled with scikit-learn as sample data
iris = datasets.load_iris()

# Feature matrix -> DataFrame, then append the label column:
# indexing target_names with the integer targets yields one species name per row
df_iris = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target_names[iris.target]
)
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [21]:
# Print a concise summary of df_iris: index range, per-column non-null counts and dtypes, memory usage
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal length (cm)    150 non-null float64
sepal width (cm)     150 non-null float64
petal length (cm)    150 non-null float64
petal width (cm)     150 non-null float64
target               150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB


In [22]:
# Show descriptive statistics (count, mean, std, min, quartiles, max); the output covers the four numeric columns
df_iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [23]:
# Build a pandas-profiling report for df_iris; as the last expression of the cell it is rendered as the cell output
pdp.ProfileReport(df_iris)

0,1
Number of variables,5
Number of observations,150
Total Missing (%),0.0%
Total size in memory,5.9 KiB
Average record size in memory,40.5 B

0,1
Numeric,3
Categorical,1
Boolean,0
Date,0
Text (Unique),0
Rejected,1
Unsupported,0

0,1
Distinct count,43
Unique (%),28.7%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.7587
Minimum,1
Maximum,6.9
Zeros (%),0.0%

0,1
Minimum,1.0
5-th percentile,1.3
Q1,1.6
Median,4.35
Q3,5.1
95-th percentile,6.1
Maximum,6.9
Range,5.9
Interquartile range,3.5

0,1
Standard deviation,1.7644
Coef of variation,0.46943
Kurtosis,-1.4019
Mean,3.7587
MAD,1.5619
Skewness,-0.27446
Sum,563.8
Variance,3.1132
Memory size,1.2 KiB

Value,Count,Frequency (%),Unnamed: 3
1.5,14,9.3%,
1.4,12,8.0%,
5.1,8,5.3%,
4.5,8,5.3%,
1.3,7,4.7%,
1.6,7,4.7%,
5.6,6,4.0%,
4.0,5,3.3%,
4.9,5,3.3%,
4.7,5,3.3%,

Value,Count,Frequency (%),Unnamed: 3
1.0,1,0.7%,
1.1,1,0.7%,
1.2,2,1.3%,
1.3,7,4.7%,
1.4,12,8.0%,

Value,Count,Frequency (%),Unnamed: 3
6.3,1,0.7%,
6.4,1,0.7%,
6.6,1,0.7%,
6.7,2,1.3%,
6.9,1,0.7%,

0,1
Correlation,0.96276

0,1
Distinct count,35
Unique (%),23.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,5.8433
Minimum,4.3
Maximum,7.9
Zeros (%),0.0%

0,1
Minimum,4.3
5-th percentile,4.6
Q1,5.1
Median,5.8
Q3,6.4
95-th percentile,7.255
Maximum,7.9
Range,3.6
Interquartile range,1.3

0,1
Standard deviation,0.82807
Coef of variation,0.14171
Kurtosis,-0.55206
Mean,5.8433
MAD,0.68756
Skewness,0.31491
Sum,876.5
Variance,0.68569
Memory size,1.2 KiB

Value,Count,Frequency (%),Unnamed: 3
5.0,10,6.7%,
6.3,9,6.0%,
5.1,9,6.0%,
6.7,8,5.3%,
5.7,8,5.3%,
5.5,7,4.7%,
5.8,7,4.7%,
6.4,7,4.7%,
6.0,6,4.0%,
4.9,6,4.0%,

Value,Count,Frequency (%),Unnamed: 3
4.3,1,0.7%,
4.4,3,2.0%,
4.5,1,0.7%,
4.6,4,2.7%,
4.7,2,1.3%,

Value,Count,Frequency (%),Unnamed: 3
7.3,1,0.7%,
7.4,1,0.7%,
7.6,1,0.7%,
7.7,4,2.7%,
7.9,1,0.7%,

0,1
Distinct count,23
Unique (%),15.3%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,3.054
Minimum,2
Maximum,4.4
Zeros (%),0.0%

0,1
Minimum,2.0
5-th percentile,2.345
Q1,2.8
Median,3.0
Q3,3.3
95-th percentile,3.8
Maximum,4.4
Range,2.4
Interquartile range,0.5

0,1
Standard deviation,0.43359
Coef of variation,0.14198
Kurtosis,0.29078
Mean,3.054
MAD,0.33309
Skewness,0.33405
Sum,458.1
Variance,0.188
Memory size,1.2 KiB

Value,Count,Frequency (%),Unnamed: 3
3.0,26,17.3%,
2.8,14,9.3%,
3.2,13,8.7%,
3.4,12,8.0%,
3.1,12,8.0%,
2.9,10,6.7%,
2.7,9,6.0%,
2.5,8,5.3%,
3.5,6,4.0%,
3.8,6,4.0%,

Value,Count,Frequency (%),Unnamed: 3
2.0,1,0.7%,
2.2,3,2.0%,
2.3,4,2.7%,
2.4,3,2.0%,
2.5,8,5.3%,

Value,Count,Frequency (%),Unnamed: 3
3.9,2,1.3%,
4.0,1,0.7%,
4.1,1,0.7%,
4.2,1,0.7%,
4.4,1,0.7%,

0,1
Distinct count,3
Unique (%),2.0%
Missing (%),0.0%
Missing (n),0

0,1
virginica,50
versicolor,50
setosa,50

Value,Count,Frequency (%),Unnamed: 3
virginica,50,33.3%,
versicolor,50,33.3%,
setosa,50,33.3%,

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


## データハンドリングあれこれ

In [26]:
# Convert the timestamp column, which was read as str, into datetime values
create_time_parsed = pd.to_datetime(df_2["create_time"])
df_2["create_time"] = create_time_parsed

# Check the resulting column dtypes
df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
id             5 non-null int32
user_id        5 non-null int64
name           5 non-null object
create_time    5 non-null datetime64[ns]
update_time    5 non-null object
dtypes: datetime64[ns](1), int32(1), int64(1), object(2)
memory usage: 260.0+ bytes


In [27]:
# Roughly the equivalent of SQL's SELECT DISTINCT: list the distinct values of one column
df_2["create_time"].unique()

array(['2018-05-01T00:00:00.000000000'], dtype='datetime64[ns]')