# EDA（Exploratory Data Analysis）

In [10]:
import sklearn

ModuleNotFoundError: No module named 'sklearn'

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the employee CSV and preview its first three rows.
employee_df = pd.read_csv(
    './input_data/employee_information.csv'
)
display(employee_df.head(n=3))

In [None]:
# Re-read the CSV, which has a header row: header=0 marks that row as the
# header, and names= replaces its labels with custom column names.
employee_df = pd.read_csv(
    './input_data/employee_information.csv',
    header=0,
    names=['col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6'],
)
display(employee_df.head(n=3))

In [None]:
# Re-read the CSV, using the 'First Name' column as the row index.
employee_df = pd.read_csv(
    './input_data/employee_information.csv',
    index_col='First Name',
)
display(employee_df.head(n=3))

In [None]:
# Reset the index: the current index moves back into an ordinary column and
# the default integer index is restored.
# The result is assigned back instead of using inplace=True, which avoids
# in-place mutation and keeps the step chainable.
employee_df = employee_df.reset_index()
display(employee_df.head(3))

In [None]:
# Check the number of rows and columns, returned as a (rows, columns) tuple
employee_df.shape

In [None]:
# Check the data type of each column
employee_df.dtypes

In [None]:
# Check basic summary statistics (count, mean, std, min, quartiles, max)
employee_df.describe()

In [None]:
# Check the column names
employee_df.columns

In [None]:
# Reload the data, then blank out two individual cells (zero-based positions)
# so that the missing-value checks in the following cells have something to find.
employee_df = pd.read_csv(
    './input_data/employee_information.csv'
)
employee_df.iloc[2, 3] = np.nan  # row 2, column 3
employee_df.iloc[4, 4] = np.nan  # row 4, column 4

In [None]:
# Per column: does it contain at least one missing value?
# (isna is the same check as isnull under its other name.)
employee_df.isna().any()

In [None]:
# Per column: how many values are missing?
employee_df.isna().sum()

In [None]:
# Show the non-null count and data type of every column
employee_df.info()

In [None]:
# Drop every row (axis=0) that has a missing value in any column.
# dropna returns a new frame; employee_df itself is not modified.
employee_df.dropna(axis=0, how='any')

In [None]:
# Drop only the rows where 'Years with Company' is missing.
# subset is passed as a list of labels: that form works across pandas
# versions, whereas a bare string was not accepted by older releases.
# dropna returns a new frame; employee_df itself is not modified.
employee_df.dropna(how='any', subset=['Years with Company'])

In [None]:
# Reload the CSV so employee_df no longer carries the NaN values injected earlier
employee_df = pd.read_csv('./input_data/employee_information.csv')

In [None]:
# Access a range of rows by position: rows 5, 6 and 7 (the end bound is exclusive).
employee_df.iloc[5:8]

In [None]:
# Insert a new 'Credit Score' column at position 0 (leftmost).
# The list supplies one value per row, so its length must match the row count.
employee_df.insert(
    loc=0,
    column='Credit Score',
    value=[680, 700, 750, 699, 550, 600, 750, 500, 520, 510],
)
display(employee_df.head(n=3))

In [None]:
# Preview the frame without the 'Last Name' and 'Salary' columns.
# drop returns a new frame; employee_df itself keeps both columns.
employee_df.drop(columns=["Last Name", "Salary"]).head(n=3)

In [None]:
# Delete the 'Credit Score' column from employee_df itself (del modifies the frame in place)
del employee_df['Credit Score']
display(employee_df.head(3))

In [None]:
# Keep only the numeric columns (int, float, ...).
employee_df.select_dtypes(include=['number'])

In [None]:
# Keep only the object-dtype columns, i.e. columns holding Python objects
# such as str, list or dict values.
employee_df.select_dtypes(include=['object'])

In [None]:
# Select a single row by its index label (loc is label-based)
employee_df.loc[0]

In [None]:
# Select a single row by its integer position (iloc is position-based)
employee_df.iloc[1]

In [None]:
# Select several rows at once by passing a list of integer positions
employee_df.iloc[[2, 4, 9]]

In [None]:
# Take the row at position 4, restricted to the first three columns.
employee_df.iloc[4, :3]

In [None]:
# Select 5 random rows (axis=0 samples rows, axis=1 samples columns).
# random_state pins the draw so re-running the notebook shows the same rows;
# change or remove it to get a different sample.
employee_df.sample(n=5, axis=0, random_state=0)

In [None]:
# Select 3 random columns (axis=0 samples rows, axis=1 samples columns)
# and preview their first three rows.
# random_state pins the draw so re-running the notebook shows the same columns;
# change or remove it to get a different sample.
employee_df.sample(n=3, axis=1, random_state=0).head(3)

In [None]:
# Sort by 'Years with Company' and show the first five rows.
# ascending=True sorts smallest-first; ascending=False sorts largest-first.
employee_df.sort_values(by='Years with Company', ascending=True).head(5)

In [None]:
# Reload the data and blank out one cell (row 1, column 2; zero-based positions)
# to create a missing value for the fill example that follows.
employee_df = pd.read_csv(
    './input_data/employee_information.csv'
)
employee_df.iloc[1, 2] = np.nan
employee_df.head(n=3)

In [None]:
# Fill missing 'Salary' values with the column mean.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on employee_df["Salary"]: that chained in-place
# form operates on an intermediate Series, raises a FutureWarning in
# pandas 2.x, and leaves employee_df unchanged once Copy-on-Write is active.
employee_df["Salary"] = employee_df["Salary"].fillna(employee_df["Salary"].mean())
employee_df.head(3)

In [None]:
# Remove duplicated rows and preview the first three rows of the result
employee_df.drop_duplicates().head(3)

In [None]:
# Load the HR dataset and plot its distributions as histograms (30 bins each, drawn in red).
# The trailing ';' keeps the notebook from printing the repr of hist()'s return value.
hr_df = pd.read_csv('./input_data/Human_Resources.csv')
hr_df.hist(bins = 30, figsize = (20,20), color = 'r');

In [None]:
# Correlation heatmap of the HR dataset.
# Restrict to numeric columns before computing pairwise correlations.
tmp_hr_df = hr_df.select_dtypes(include='number')
correlations = tmp_hr_df.corr()
f, ax = plt.subplots(figsize = (20, 20))
# Draw explicitly on the axes created above instead of relying on matplotlib's
# implicit "current axes"; annot=True writes each coefficient into its cell.
# The trailing ';' suppresses the Axes repr in the cell output.
sns.heatmap(correlations, annot = True, ax = ax);