Módulo básico - Análise exploratória de dados com Python

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the initial dataset from a CSV file (path is relative, two directories up).
# NOTE(review): the preview recorded below shows Fare values such as 712833.0 and
# 7925.0 next to 7.25 and 53.1 — possibly a decimal/thousands-separator problem in
# the file; confirm before relying on Fare.
df = pd.read_csv('../../data/train.csv')

In [4]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,712833.0,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7925.0,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,300.0,,S


Entendendo os tipos de variáveis que temos na base

In [5]:
# Show the dtype pandas assigned to each column
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
# Build two lists of column names: one for the quantitative (numeric) columns
# and one for the qualitative (non-numeric) columns.
#
# select_dtypes is used instead of comparing each dtype against 'object' so
# that non-numeric columns stored with a dtype other than object (for example
# category, boolean or a dedicated string dtype) are not classified as
# quantitative. Column order is preserved in both lists.

quanti = df.select_dtypes(include='number').columns.tolist()
quali = df.select_dtypes(exclude='number').columns.tolist()

In [7]:
# Print both column-name lists to check how the columns were split
print('Lista Quanti: ', quanti)
print('Lista Quali: ', quali)

Lista Quanti:  ['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Lista Quali:  ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [8]:
# Create a DataFrame holding only the quantitative columns, to be used for descriptive statistics

df_quanti = df[quanti]

In [9]:
# Display the quantitative subset (the rendered output is truncated by pandas for long frames)
df_quanti

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,1,0,3,22.0,1,0,7.25
1,2,1,1,38.0,1,0,712833.00
2,3,1,3,26.0,0,0,7925.00
3,4,1,1,35.0,1,0,53.10
4,5,0,3,35.0,0,0,300.00
...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.00
887,888,1,1,19.0,0,0,30.00
888,889,0,3,,1,2,23.45
889,890,1,1,26.0,0,0,30.00


In [10]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the quantitative columns.
# NOTE(review): in the recorded output Fare has mean ~7.4e5 and max ~1.2e8 although
# individual fares such as 7.25 appear in the data — the scale looks suspicious;
# confirm how the CSV encodes decimal values.
df_quanti.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.717325,0.523008,0.381594,742459.8
std,257.353842,0.486592,0.836071,14.591695,1.102743,0.806057,5995953.0
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.0,0.0,0.0,26.55
50%,446.0,0.0,3.0,28.0,0.0,0.0,7333.0
75%,668.5,1.0,3.0,38.75,1.0,0.0,254667.0
max,891.0,1.0,3.0,80.0,8.0,6.0,124091300.0


In [11]:
# Compute the median of each quantitative column explicitly.
# The 50% row of describe() in the previous cell reports the same values.
df_quanti.median()

PassengerId     446.0
Survived          0.0
Pclass            3.0
Age              28.0
SibSp             0.0
Parch             0.0
Fare           7333.0
dtype: float64

In [12]:
# Frequency tables for the qualitative variables

df_quali = df[quali]

# Frequency table for the Sex variable: number of non-null Name entries per category

df_quali.groupby('Sex')['Name'].count()

Sex
female    314
male      577
Name: Name, dtype: int64

In [14]:
# Build all the frequency tables in a single pass.
# The Name variable is skipped: it is expected to be unique for each passenger,
# so a frequency table for it would not be informative.

for col in df_quali.columns:
    if col == 'Name':
        continue
    print('---------------')
    print('Variavel: ', col)
    print(df_quali.groupby(col)['Name'].count())
    print('---------------')

---------------
Variavel:  Sex
Sex
female    314
male      577
Name: Name, dtype: int64
---------------
---------------
Variavel:  Ticket
Ticket
110152         3
110413         3
110465         2
110564         1
110813         1
              ..
W./C. 6608     4
W./C. 6609     1
W.E.P. 5734    1
W/C 14208      1
WE/P 5735      2
Name: Name, Length: 681, dtype: int64
---------------
---------------
Variavel:  Cabin
Cabin
A10    1
A14    1
A16    1
A19    1
A20    1
      ..
F33    3
F38    1
F4     2
G6     4
T      1
Name: Name, Length: 147, dtype: int64
---------------
---------------
Variavel:  Embarked
Embarked
C    168
Q     77
S    644
Name: Name, dtype: int64
---------------


Detecção de dados nulos

In [15]:
# Goal: for every variable in df, obtain the number of null values and
# view those counts in a table (a DataFrame).
# This cell starts that table with one row per column name of df.

nulos = pd.DataFrame({'Variavel': df.columns})

In [16]:
# Display the table of variable names built in the previous cell
nulos

Unnamed: 0,Variavel
0,PassengerId
1,Survived
2,Pclass
3,Name
4,Sex
5,Age
6,SibSp
7,Parch
8,Ticket
9,Fare


In [30]:
# Null-value summary: one row per column of df, with the number of missing
# values ("Quantidade") and the percentage of rows they represent ("Porcentagem").
nulos = (
    df.isna().sum()
      # Name the index and the values explicitly instead of relying on the
      # default "index" / 0 labels produced by reset_index().
      .rename_axis("Variavel")
      .reset_index(name="Quantidade")
)
# The denominator is the total number of rows, len(df), rather than the
# non-null count of one particular column, so the percentage does not depend
# on that column being present and complete.
nulos["Porcentagem"] = ((nulos["Quantidade"] / len(df)) * 100).round(2)


In [31]:
# Display the null-value summary (count and percentage per variable)
nulos

Unnamed: 0,Variavel,Quantidade,Porcentagem
0,PassengerId,0,0.0
1,Survived,0,0.0
2,Pclass,0,0.0
3,Name,0,0.0
4,Sex,0,0.0
5,Age,177,19.87
6,SibSp,0,0.0
7,Parch,0,0.0
8,Ticket,0,0.0
9,Fare,0,0.0
