# Medidas de posição e dispersão

## Base de dados

In [3]:
import numpy as np
import statistics
from scipy import stats
import math
import pandas as pd

In [31]:
dados = np.array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
                  157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
                  163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
                  173])

In [None]:
dados.shape

(40,)

## Média aritmética simples

In [None]:
len(dados)

40

In [None]:
dados.sum() / len(dados)

160.375

In [None]:
dados.sum() / len(dados)

160.375

In [None]:
dados.mean()

160.375

In [None]:
dados.mean()

160.375

In [None]:
statistics.mean(dados)

160

In [None]:
statistics.mean(dados)

160

## Moda

In [None]:
statistics.mode(dados)

160

In [None]:
stats.mode(dados)

ModeResult(mode=array([160]), count=array([5]))

## Mediana

In [None]:
dados_impar = [150, 151, 152, 152, 153, 154, 155, 155, 155]

### Cálculo manual (ímpar)

In [None]:
posicao = len(dados_impar) / 2
posicao

4.5

In [None]:
posicao = math.ceil(posicao)
posicao

5

In [None]:
dados_impar[posicao - 1]

153

### Cálculo manual (par)

In [None]:
posicao = len(dados) // 2
posicao

20

In [None]:
dados[posicao - 1], dados[posicao]

(160, 160)

In [None]:
mediana = (dados[posicao - 1] + dados[posicao]) / 2
mediana

160.0

### Bibliotecas

In [None]:
np.median(dados_impar)

153.0

In [None]:
np.median(dados)

160.0

In [None]:
statistics.median(dados_impar)

153

In [None]:
statistics.median(dados)

160.0

## Média aritmética ponderada

In [None]:
notas = np.array([9, 8, 7, 3])
pesos = np.array([1, 2, 3, 4])

In [None]:
(9 * 1 + 8 * 2 + 7 * 3 + 3 * 4) / (1 + 2 + 3 + 4)

5.8

In [None]:
notas

array([9, 8, 7, 3])

In [None]:
pesos

array([1, 2, 3, 4])

In [None]:
#notas * pesos
(notas * pesos).sum() /  pesos.sum() 

5.8

In [None]:
media_ponderada = (notas * pesos).sum() / pesos.sum()
media_ponderada

5.8

In [None]:
np.average(notas, weights=pesos)

5.8

## Média aritmética, moda e mediana com distribuição de frequência (dados agrupados)

In [43]:
dados1 = {'inferior': [150, 154, 158, 162, 166, 170],
         'superior': [154, 158, 162, 166, 170, 174],
         'fi': [5, 9, 11, 7, 5, 3]}

In [44]:
import pandas as pd
dataset = pd.DataFrame(dados1)
dataset

Unnamed: 0,inferior,superior,fi
0,150,154,5
1,154,158,9
2,158,162,11
3,162,166,7
4,166,170,5
5,170,174,3


In [45]:
dataset['xi'] = (dataset['superior'] + dataset['inferior']) / 2
dataset

Unnamed: 0,inferior,superior,fi,xi
0,150,154,5,152.0
1,154,158,9,156.0
2,158,162,11,160.0
3,162,166,7,164.0
4,166,170,5,168.0
5,170,174,3,172.0


In [None]:
dataset['fi.xi'] = dataset['fi'] * dataset['xi']
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi
0,150,154,5,152.0,760.0
1,154,158,9,156.0,1404.0
2,158,162,11,160.0,1760.0
3,162,166,7,164.0,1148.0
4,166,170,5,168.0,840.0
5,170,174,3,172.0,516.0


In [None]:
dataset['Fi']

0     5.0
1    14.0
2    25.0
3    32.0
4    37.0
5    40.0
Name: Fi, dtype: float64

In [None]:
dataset.describe()

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
count,6.0,6.0,6.0,6.0,6.0,6.0
mean,160.0,164.0,6.666667,162.0,1071.333333,25.5
std,7.483315,7.483315,2.94392,7.483315,458.180168,13.693064
min,150.0,154.0,3.0,152.0,516.0,5.0
25%,155.0,159.0,5.0,157.0,780.0,16.75
50%,160.0,164.0,6.0,162.0,994.0,28.5
75%,165.0,169.0,8.5,167.0,1340.0,35.75
max,170.0,174.0,11.0,172.0,1760.0,40.0


In [None]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
inferior,6.0,160.0,7.483315,150.0,155.0,160.0,165.0,170.0
superior,6.0,164.0,7.483315,154.0,159.0,164.0,169.0,174.0
fi,6.0,6.666667,2.94392,3.0,5.0,6.0,8.5,11.0
xi,6.0,162.0,7.483315,152.0,157.0,162.0,167.0,172.0
fi.xi,6.0,1071.333333,458.180168,516.0,780.0,994.0,1340.0,1760.0
Fi,6.0,25.5,13.693064,5.0,16.75,28.5,35.75,40.0


In [None]:
dataset['Fi'] = 0
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,0
1,154,158,9,156.0,1404.0,0
2,158,162,11,160.0,1760.0,0
3,162,166,7,164.0,1148.0,0
4,166,170,5,168.0,840.0,0
5,170,174,3,172.0,516.0,0


In [None]:
frequencia_acumulada = []
somatorio = 0
for linha in dataset.iterrows():
  print(linha[1])
  print(linha[1][2])
  somatorio += linha[1][2]
  frequencia_acumulada.append(somatorio)

inferior    150.0
superior    154.0
fi            5.0
xi          152.0
fi.xi       760.0
Fi            0.0
Name: 0, dtype: float64
5.0
inferior     154.0
superior     158.0
fi             9.0
xi           156.0
fi.xi       1404.0
Fi             0.0
Name: 1, dtype: float64
9.0
inferior     158.0
superior     162.0
fi            11.0
xi           160.0
fi.xi       1760.0
Fi             0.0
Name: 2, dtype: float64
11.0
inferior     162.0
superior     166.0
fi             7.0
xi           164.0
fi.xi       1148.0
Fi             0.0
Name: 3, dtype: float64
7.0
inferior    166.0
superior    170.0
fi            5.0
xi          168.0
fi.xi       840.0
Fi            0.0
Name: 4, dtype: float64
5.0
inferior    170.0
superior    174.0
fi            3.0
xi          172.0
fi.xi       516.0
Fi            0.0
Name: 5, dtype: float64
3.0


In [None]:
frequencia_acumulada

[5.0, 14.0, 25.0, 32.0, 37.0, 40.0]

In [None]:
dataset['Fi'] = frequencia_acumulada
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [None]:
dataset.describe()

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
count,6.0,6.0,6.0,6.0,6.0,6.0
mean,160.0,164.0,6.666667,162.0,1071.333333,25.5
std,7.483315,7.483315,2.94392,7.483315,458.180168,13.693064
min,150.0,154.0,3.0,152.0,516.0,5.0
25%,155.0,159.0,5.0,157.0,780.0,16.75
50%,160.0,164.0,6.0,162.0,994.0,28.5
75%,165.0,169.0,8.5,167.0,1340.0,35.75
max,170.0,174.0,11.0,172.0,1760.0,40.0


In [None]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
inferior,6.0,160.0,7.483315,150.0,155.0,160.0,165.0,170.0
superior,6.0,164.0,7.483315,154.0,159.0,164.0,169.0,174.0
fi,6.0,6.666667,2.94392,3.0,5.0,6.0,8.5,11.0
xi,6.0,162.0,7.483315,152.0,157.0,162.0,167.0,172.0
fi.xi,6.0,1071.333333,458.180168,516.0,780.0,994.0,1340.0,1760.0
Fi,6.0,25.5,13.693064,5.0,16.75,28.5,35.75,40.0


### Média

In [None]:
a = dataset['fi.xi'].sum()

In [None]:
b = dataset['fi'].sum()

In [None]:
a / b

160.7

In [None]:
a // b

160.0

In [None]:
dataset['fi'].sum()

40

In [None]:
dataset['fi'].sum(), dataset['fi.xi'].sum()

(40, 6428.0)

In [None]:
dataset['fi.xi'].sum() / dataset['fi'].sum()

160.7

### Moda

In [None]:
dataset['fi'].max()

11

In [None]:
dataset[dataset['fi'] == dataset['fi'].max()]

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
2,158,162,11,160.0,1760.0,25.0


In [None]:
dataset[dataset['fi'] == dataset['fi'].max()]['xi'].values[0]

160.0

### Mediana

In [None]:
dataset

Unnamed: 0,inferior,superior,fi,xi,fi.xi,Fi
0,150,154,5,152.0,760.0,5.0
1,154,158,9,156.0,1404.0,14.0
2,158,162,11,160.0,1760.0,25.0
3,162,166,7,164.0,1148.0,32.0
4,166,170,5,168.0,840.0,37.0
5,170,174,3,172.0,516.0,40.0


In [None]:
dataset['fi'].sum()/2

20.0

In [None]:
fi_2 = dataset['fi'].sum() / 2
fi_2

20.0

In [None]:
limite_inferior, frequencia_classe, id_frequencia_anterior = 0, 0, 0
for linha in dataset.iterrows():
  #print(linha)
  limite_inferior = linha[1][0]
  frequencia_classe = linha[1][2]
  id_frequencia_anterior = linha[0]
  if linha[1][5] >= fi_2:
    id_frequencia_anterior -= 1
    break

In [None]:
limite_inferior, frequencia_classe, id_frequencia_anterior

(158.0, 11.0, 1)

In [None]:
Fi_anterior = dataset.iloc[[id_frequencia_anterior]]['Fi'].values[0]
Fi_anterior

14.0

In [None]:
mediana = limite_inferior + ((fi_2 - Fi_anterior) * 4) / frequencia_classe
mediana

160.1818181818182

### Função completa

In [None]:
def get_estatisticas(dataframe):
  media = dataset['fi.xi'].sum() / dataset['fi'].sum()
  moda = dataset[dataset['fi'] == dataset['fi'].max()]['xi'].values[0]

  fi_2 = dataset['fi'].sum() / 2
  limite_inferior, frequencia_classe, id_frequencia_anterior = 0, 0, 0
  for i, linha in enumerate(dataset.iterrows()):
    limite_inferior = linha[1][0]
    frequencia_classe = linha[1][2]
    id_frequencia_anterior = linha[0]
    if linha[1][5] >= fi_2:
      id_frequencia_anterior -= 1
      break
  Fi_anterior = dataset.iloc[[id_frequencia_anterior]]['Fi'].values[0]
  mediana = limite_inferior + ((fi_2 - Fi_anterior) * 4) / frequencia_classe

  return media, moda, mediana

In [None]:
get_estatisticas(dataset)
# Retorna Média, Moda e Mediana

(160.7, 160.0, 160.1818181818182)

## Média geométrica, harmônica e quadrática

### Média geométrica

In [None]:
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [None]:
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [None]:
dados.shape

(40,)

In [None]:
from scipy.stats.mstats import gmean

In [None]:
gmean(dados)

160.26958390038905

### Média harmônica

In [None]:
from scipy.stats.mstats import hmean

In [None]:
hmean(dados)

160.16471947994674

### Média quadrática

In [None]:
def quadratic_mean(dados):
  return math.sqrt(sum(n * n for n in dados) / len(dados))

In [None]:
quadratic_mean(dados)

160.48091786876097

## Quartis

In [None]:
dados_impar = [150, 151, 152, 152, 153, 154, 155, 155, 155]

### Cálculo manual

In [None]:
np.median(dados_impar)

153.0

In [None]:
posicao_mediana = math.floor(len(dados_impar) / 2)
posicao_mediana

4

In [None]:
esquerda = dados_impar[0:posicao_mediana]
esquerda

[150, 151, 152, 152]

In [None]:
np.median(esquerda)

151.5

In [None]:
direita = dados_impar[posicao_mediana + 1:]
direita

[154, 155, 155, 155]

In [None]:
np.median(direita)

155.0

### Bibliotecas

#### numpy

In [None]:
np.quantile(dados_impar, 0.5)

153.0

In [None]:
np.quantile(dados_impar, 0.75)

155.0

In [None]:
np.quantile(dados_impar, 0.25)

152.0

In [None]:
esquerda2 = dados_impar[0:posicao_mediana + 1]
esquerda2

[150, 151, 152, 152, 153]

In [None]:
np.median(esquerda2)

152.0

In [None]:
np.quantile(dados, 0.25), np.quantile(dados, 0.50), np.quantile(dados, 0.75)

(155.75, 160.0, 164.0)

#### scipy

In [None]:
stats.scoreatpercentile(dados, 25), stats.scoreatpercentile(dados, 50), stats.scoreatpercentile(dados, 75)

(155.75, 160.0, 164.0)

#### pandas

In [None]:
import pandas as pd
dataset = pd.DataFrame(dados)
dataset.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [None]:
dados

In [None]:
dataset.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [None]:
dataset.tail()

Unnamed: 0,0
35,168
36,169
37,170
38,172
39,173


In [None]:
dataset

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153
5,154
6,155
7,155
8,155
9,155


In [None]:
dataset.describe()

Unnamed: 0,0
count,40.0
mean,160.375
std,5.903877
min,150.0
25%,155.75
50%,160.0
75%,164.0
max,173.0


In [None]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,40.0,160.375,5.903877,150.0,155.75,160.0,164.0,173.0


In [None]:
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [None]:
dados.shape

(40,)

In [None]:
dataset.quantile([0.25, 0.5, 0.75])

Unnamed: 0,0
0.25,155.75
0.5,160.0
0.75,164.0


In [None]:
dataset.describe()

Unnamed: 0,0
count,40.0
mean,160.375
std,5.903877
min,150.0
25%,155.75
50%,160.0
75%,164.0
max,173.0


## Quartis com distribuição de frequência (dados agrupados)

In [55]:
dados = {'inferior': [150, 154, 158, 162, 166, 170],
         'superior': [154, 158, 162, 166, 170, 174],
         'fi': [5, 9, 11, 7, 5, 3]}

In [58]:
dados = np.array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
                  157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
                  163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
                  173])

In [59]:
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [60]:
import pandas as pd
dataset = pd.DataFrame(dados)
dataset.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [61]:
dataset

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153
5,154
6,155
7,155
8,155
9,155


In [62]:
def get_quartil(dataframe, q1 = True):
  if q1 == True:
    fi_4 = dataset['fi'].sum() / 4
  else:
    fi_4 = (3 * dataset['fi'].sum()) / 4
  
  limite_inferior, frequencia_classe, id_frequencia_anterior = 0, 0, 0
  for linha in dataset.iterrows():
    limite_inferior = linha[1][0]
    frequencia_classe = linha[1][2]
    id_frequencia_anterior = linha[0]
    if linha[1][5] >= fi_4:
      id_frequencia_anterior -= 1
      break
  Fi_anterior = dataset.iloc[[id_frequencia_anterior]]['Fi'].values[0]
  q = limite_inferior + ((fi_4 - Fi_anterior) * 4) / frequencia_classe

  return q

In [63]:
get_quartil(dados), get_quartil(dados, q1 = False)
#get_quartil(dataset), get_quartil(dados, q1 = False)

KeyError: ignored

## Percentis

In [64]:
dados

array([150, 151, 152, 152, 153, 154, 155, 155, 155, 155, 156, 156, 156,
       157, 158, 158, 160, 160, 160, 160, 160, 161, 161, 161, 161, 162,
       163, 163, 164, 164, 164, 165, 166, 167, 168, 168, 169, 170, 172,
       173])

In [65]:
np.median(dados)

160.0

In [66]:
np.quantile(dados, 0.5)

160.0

In [67]:
np.percentile(dados, 50)

160.0

In [68]:
np.percentile(dados, 5), np.percentile(dados, 10), np.percentile(dados, 90)

(151.95000000000002, 152.89999999999998, 168.1)

In [69]:
stats.scoreatpercentile(dados, 5), stats.scoreatpercentile(dados, 10), stats.scoreatpercentile(dados, 90)

(151.95000000000002, 152.89999999999998, 168.1)

In [70]:
import pandas as pd
dataset = pd.DataFrame(dados)
dataset.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [71]:
dataset.quantile([0.05, 0.10, 0.90])

Unnamed: 0,0
0.05,151.95
0.1,152.9
0.9,168.1


## Exercício

In [27]:
dataset = pd.read_csv('census.csv')

In [28]:
dataset.shape

(8530, 15)

In [75]:
dataset.describe()

Unnamed: 0,age,final-weight,education-num,capital-gain,capital-loos,hour-per-week
count,8530.0,8530.0,8530.0,8530.0,8530.0,8530.0
mean,38.462251,190438.2,10.072567,1054.440094,89.633529,40.509144
std,13.580418,106451.9,2.545174,7269.36192,403.858449,12.27129
min,17.0,19302.0,1.0,0.0,0.0,1.0
25%,28.0,117911.2,9.0,0.0,0.0,40.0
50%,37.0,178629.5,10.0,0.0,0.0,40.0
75%,47.0,238735.5,12.0,0.0,0.0,45.0
max,90.0,1226583.0,16.0,99999.0,3004.0,99.0


In [76]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,8530.0,38.462251,13.580418,17.0,28.0,37.0,47.0,90.0
final-weight,8530.0,190438.159086,106451.945872,19302.0,117911.25,178629.5,238735.5,1226583.0
education-num,8530.0,10.072567,2.545174,1.0,9.0,10.0,12.0,16.0
capital-gain,8530.0,1054.440094,7269.36192,0.0,0.0,0.0,0.0,99999.0
capital-loos,8530.0,89.633529,403.858449,0.0,0.0,0.0,0.0,3004.0
hour-per-week,8530.0,40.509144,12.27129,1.0,40.0,40.0,45.0,99.0


In [74]:
dataset.head()

Unnamed: 0,age,workclass,final-weight,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loos,hour-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [78]:
dataset['age'].mean()

38.46225087924971

In [79]:
stats.hmean(dataset['age'])

33.841618675017486

In [80]:
from scipy.stats.mstats import gmean
gmean(dataset['age'])

36.11081431300233

In [81]:
quadratic_mean(dataset['age'])

40.789102457025294

In [82]:
dataset['age'].median()

37.0

In [84]:
statistics.mode(dataset['age'])

31

## Medidas de dispersão

### Amplitude total e diferença interquartil

In [29]:
dados

{'fi': [5, 9, 11, 7, 5, 3],
 'inferior': [150, 154, 158, 162, 166, 170],
 'superior': [154, 158, 162, 166, 170, 174]}

import pandas as pd

dataset = pd.DataFrame(dados)

dataset.head()

In [87]:
import pandas as pd
dataset = pd.DataFrame(dados)

In [88]:
dataset

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153
5,154
6,155
7,155
8,155
9,155


In [90]:
dados = dataset

In [113]:
dados.head()

Unnamed: 0,0
0,150
1,151
2,152
3,152
4,153


In [91]:
dados.describe()

Unnamed: 0,0
count,40.0
mean,160.375
std,5.903877
min,150.0
25%,155.75
50%,160.0
75%,164.0
max,173.0


In [46]:
dados.min()

150

In [93]:
dados.max()

0    173
dtype: int64

In [92]:
dados.max() - dados.min()

0    23
dtype: int64

In [95]:
q1 = np.quantile(dados, 0.25)
q3 = np.quantile(dados, 0.75)
q1, q3

(155.75, 164.0)

In [97]:
164-155.75

8.25

In [96]:
diferenca_interquartil = q3 - q1
diferenca_interquartil

8.25

In [98]:
inferior = q1 - (1.5 * diferenca_interquartil)
inferior

143.375

In [99]:
superior = q3 + (1.5 * diferenca_interquartil)
superior

176.375

### Variância, desvio padrão e coeficiente de variação

In [6]:
dados_impar = np.array([150, 151, 152, 152, 153, 154, 155, 155, 155])

#### Cálculo manual

In [124]:
# Calculo da Média Aritmética
media = dados_impar.sum() / len(dados_impar)
media

153.0

In [125]:
# Calculo Valor do Desvio =>  Diferença entre o valor da base e a média
desvio = abs(dados_impar - media)
desvio

array([3., 2., 1., 1., 0., 1., 2., 2., 2.])

In [126]:
# Eleva o desvio ao aquadrado
desvio = desvio ** 2
desvio

array([9., 4., 1., 1., 0., 1., 4., 4., 4.])

In [127]:
# SOma o Desvio do Array
soma_desvio = desvio.sum()
soma_desvio

28.0

In [128]:
v = soma_desvio / len(dados_impar)
v

3.111111111111111

In [129]:
dp = math.sqrt(v)
dp

1.7638342073763937

In [130]:
cv = (dp / media) * 100
cv

1.1528328152786886

In [131]:
def get_variancia_desvio_padrao_coeficiente(dataset):
  media = dataset.sum() / len(dataset)
  desvio = abs(dados_impar - media)
  desvio = desvio ** 2
  soma_desvio = desvio.sum()
  variancia = soma_desvio / len(dados_impar)
  dp = math.sqrt(variancia)
  return variancia, dp, (dp / media) * 100

In [133]:
get_variancia_desvio_padrao_coeficiente(dados_impar)
# Variância => Afastamento da Média, Distribuidos no DataSet
# Desvio Padrão => 
# Coeficiente de Variância => 
#

(3.111111111111111, 1.7638342073763937, 1.1528328152786886)

#### Bibliotecas

In [22]:
np.var(dados_impar)

3.111111111111111

In [23]:
np.std(dados_impar)

1.7638342073763937

In [34]:
np.var(dados)

33.984375

In [35]:
np.std(dados)

5.829611908180509

In [36]:
statistics.variance(dados)

34

In [37]:
statistics.stdev(dados)

5.830951894845301

In [38]:
from scipy import ndimage
ndimage.variance(dados)

33.984375

In [39]:
stats.tstd(dados, ddof = 0)

5.829611908180509

In [40]:
stats.variation(dados_impar) * 100

1.1528328152786886

In [41]:
stats.variation(dados) * 100

3.634987939629312

### Desvio padrão com dados agrupados

# Nova secção

In [60]:
dataset.describe()

Unnamed: 0,inferior,superior,fi,xi,xi_2,fi_xi_2
count,6.0,6.0,6.0,6.0,6.0,6.0
mean,160.0,164.0,6.666667,162.0,26290.666667,172381.333333
std,7.483315,7.483315,2.94392,7.483315,2424.988137,71499.813109
min,150.0,154.0,3.0,152.0,23104.0,88752.0
25%,155.0,159.0,5.0,157.0,24652.0,121920.0
50%,160.0,164.0,6.0,162.0,26248.0,164696.0
75%,165.0,169.0,8.5,167.0,27892.0,211336.0
max,170.0,174.0,11.0,172.0,29584.0,281600.0


In [59]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
inferior,6.0,160.0,7.483315,150.0,155.0,160.0,165.0,170.0
superior,6.0,164.0,7.483315,154.0,159.0,164.0,169.0,174.0
fi,6.0,6.666667,2.94392,3.0,5.0,6.0,8.5,11.0
xi,6.0,162.0,7.483315,152.0,157.0,162.0,167.0,172.0
xi_2,6.0,26290.666667,2424.988137,23104.0,24652.0,26248.0,27892.0,29584.0
fi_xi_2,6.0,172381.333333,71499.813109,88752.0,121920.0,164696.0,211336.0,281600.0


In [61]:
dataset

Unnamed: 0,inferior,superior,fi,xi,xi_2,fi_xi_2
0,150,154,5,152.0,23104.0,115520.0
1,154,158,9,156.0,24336.0,219024.0
2,158,162,11,160.0,25600.0,281600.0
3,162,166,7,164.0,26896.0,188272.0
4,166,170,5,168.0,28224.0,141120.0
5,170,174,3,172.0,29584.0,88752.0


In [48]:
dataset['xi_2'] = dataset['xi'] * dataset['xi']
dataset

Unnamed: 0,inferior,superior,fi,xi,xi_2
0,150,154,5,152.0,23104.0
1,154,158,9,156.0,24336.0
2,158,162,11,160.0,25600.0
3,162,166,7,164.0,26896.0
4,166,170,5,168.0,28224.0
5,170,174,3,172.0,29584.0


In [49]:
dataset['fi_xi_2'] = dataset['fi'] * dataset['xi_2']
dataset

Unnamed: 0,inferior,superior,fi,xi,xi_2,fi_xi_2
0,150,154,5,152.0,23104.0,115520.0
1,154,158,9,156.0,24336.0,219024.0
2,158,162,11,160.0,25600.0,281600.0
3,162,166,7,164.0,26896.0,188272.0
4,166,170,5,168.0,28224.0,141120.0
5,170,174,3,172.0,29584.0,88752.0


In [63]:
dataset.columns

Index(['inferior', 'superior', 'fi', 'xi', 'xi_2', 'fi_xi_2'], dtype='object')

In [51]:
colunas_ordenadas = ['inferior', 'superior', 'fi', 'xi', 'fi.xi', 'xi_2', 'fi_xi_2', 'Fi']

In [54]:
colunas_ordenadas

['inferior', 'superior', 'fi', 'xi', 'fi.xi', 'xi_2', 'fi_xi_2', 'Fi']

In [53]:
dataset

Unnamed: 0,inferior,superior,fi,xi,xi_2,fi_xi_2
0,150,154,5,152.0,23104.0,115520.0
1,154,158,9,156.0,24336.0,219024.0
2,158,162,11,160.0,25600.0,281600.0
3,162,166,7,164.0,26896.0,188272.0
4,166,170,5,168.0,28224.0,141120.0
5,170,174,3,172.0,29584.0,88752.0


In [62]:
dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
inferior,6.0,160.0,7.483315,150.0,155.0,160.0,165.0,170.0
superior,6.0,164.0,7.483315,154.0,159.0,164.0,169.0,174.0
fi,6.0,6.666667,2.94392,3.0,5.0,6.0,8.5,11.0
xi,6.0,162.0,7.483315,152.0,157.0,162.0,167.0,172.0
xi_2,6.0,26290.666667,2424.988137,23104.0,24652.0,26248.0,27892.0,29584.0
fi_xi_2,6.0,172381.333333,71499.813109,88752.0,121920.0,164696.0,211336.0,281600.0


In [57]:
dataset[colunas_ordenadas]

KeyError: ignored

In [56]:
dataset = dataset[colunas_ordenadas]
dataset

KeyError: ignored

In [None]:
dp = math.sqrt(dataset['fi_xi_2'].sum() / dataset['fi'].sum() - math.pow(dataset['fi.xi'].sum() / dataset['fi'].sum(), 2))
dp

5.719265687131764

## Testes com algoritmos de classificação

In [91]:
import pandas as pd
dataset = pd.read_csv('credit_data.csv')

In [69]:
dataset.shape

(2000, 5)

In [92]:
# Apagar linhas nulas
dataset.dropna(inplace=True)
dataset.shape

(1997, 5)

In [93]:
dataset

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [94]:
X = dataset.iloc[:, 1:4].values
X

array([[6.61559251e+04, 5.90170151e+01, 8.10653213e+03],
       [3.44151540e+04, 4.81171531e+01, 6.56474502e+03],
       [5.73171701e+04, 6.31080495e+01, 8.02095330e+03],
       ...,
       [4.43114493e+04, 2.80171669e+01, 5.52278669e+03],
       [4.37560566e+04, 6.39717958e+01, 1.62272260e+03],
       [6.94365796e+04, 5.61526170e+01, 7.37883360e+03]])

In [95]:
y = dataset.iloc[:, 4].values
y

array([0, 0, 0, ..., 1, 0, 0])

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [97]:
resultados_naive_bayes = []
resultados_logistica = []
resultados_forest = []
for i in range(30):
  X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(X, y, test_size = 0.2,
                                                                    stratify = y, random_state = i)
  naive_bayes = GaussianNB()
  naive_bayes.fit(X_treinamento, y_treinamento)
  resultados_naive_bayes.append(accuracy_score(y_teste, naive_bayes.predict(X_teste)))

  logistica = LogisticRegression()
  logistica.fit(X_treinamento, y_treinamento)
  resultados_logistica.append(accuracy_score(y_teste, logistica.predict(X_teste)))

  random_forest = RandomForestClassifier()
  random_forest.fit(X_treinamento, y_treinamento)
  resultados_forest.append(accuracy_score(y_teste, random_forest.predict(X_teste)))

In [98]:
print(resultados_naive_bayes)

[0.925, 0.925, 0.9325, 0.925, 0.92, 0.905, 0.9175, 0.9175, 0.9125, 0.9325, 0.9225, 0.9125, 0.935, 0.9175, 0.925, 0.9175, 0.9275, 0.92, 0.9325, 0.92, 0.93, 0.905, 0.9175, 0.9325, 0.9425, 0.9375, 0.94, 0.92, 0.935, 0.925]


In [99]:
print(resultados_logistica)

[0.9325, 0.91, 0.9125, 0.9225, 0.9075, 0.89, 0.91, 0.9075, 0.8775, 0.915, 0.9175, 0.9, 0.925, 0.9175, 0.9025, 0.9125, 0.9525, 0.91, 0.9225, 0.9075, 0.925, 0.905, 0.9075, 0.945, 0.9225, 0.9275, 0.9225, 0.9175, 0.91, 0.9]


In [100]:
print(resultados_forest)

[0.975, 0.99, 0.9825, 0.9925, 0.98, 0.9875, 0.9875, 0.985, 0.9725, 0.9825, 0.9825, 0.975, 0.9725, 0.9875, 0.98, 0.975, 0.9875, 0.98, 0.9925, 0.9875, 0.9825, 0.98, 0.985, 0.99, 0.9825, 0.995, 0.9925, 0.9825, 0.99, 0.985]


In [101]:
type(resultados_naive_bayes)

list

In [102]:
resultados_naive_bayes = np.array(resultados_naive_bayes)
resultados_logistica = np.array(resultados_logistica)
resultados_forest = np.array(resultados_forest)

In [103]:
type(resultados_naive_bayes)

numpy.ndarray

### Média

In [104]:
resultados_naive_bayes.mean(), resultados_logistica.mean(), resultados_forest.mean()

(0.92425, 0.9145, 0.9840000000000001)

### Moda

In [105]:
statistics.mode(resultados_naive_bayes)

StatisticsError: ignored

In [106]:
stats.mode(resultados_naive_bayes), stats.mode(resultados_logistica), stats.mode(resultados_forest)

(ModeResult(mode=array([0.9175]), count=array([5])),
 ModeResult(mode=array([0.9075]), count=array([4])),
 ModeResult(mode=array([0.9825]), count=array([6])))

### Mediana

In [107]:
np.median(resultados_naive_bayes), np.median(resultados_logistica), np.median(resultados_forest)

(0.925, 0.9125, 0.98375)

### Variância

In [108]:
np.set_printoptions(suppress=True)
np.var(resultados_naive_bayes), np.var(resultados_logistica), np.var(resultados_forest)

(8.756250000000001e-05, 0.00020933333333333337, 3.65000000000001e-05)

In [109]:
np.min([8.756250000000001e-05, 0.00020933333333333337, 2.9229166666666637e-05])

2.9229166666666637e-05

In [110]:
np.max([8.756250000000001e-05, 0.00020933333333333337, 2.9229166666666637e-05])

0.00020933333333333337

In [111]:
resultados_forest

array([0.975 , 0.99  , 0.9825, 0.9925, 0.98  , 0.9875, 0.9875, 0.985 ,
       0.9725, 0.9825, 0.9825, 0.975 , 0.9725, 0.9875, 0.98  , 0.975 ,
       0.9875, 0.98  , 0.9925, 0.9875, 0.9825, 0.98  , 0.985 , 0.99  ,
       0.9825, 0.995 , 0.9925, 0.9825, 0.99  , 0.985 ])

### Desvio padrão

In [112]:
np.std(resultados_naive_bayes), np.std(resultados_logistica), np.std(resultados_forest)

(0.00935748363610645, 0.014468356276140472, 0.006041522986797295)

### Coeficiente de variação

In [113]:
stats.variation(resultados_naive_bayes) * 100, stats.variation(resultados_logistica) * 100, stats.variation(resultados_forest) * 100

(1.0124407504578252, 1.5821056616884057, 0.613975913292408)

### Exercício: validação cruzada

In [114]:
from sklearn.model_selection import cross_val_score, KFold

In [115]:
resultados_naive_bayes_cv = []
resultados_logistica_cv = []
resultados_forest_cv = []
for i in range(30):
  kfold = KFold(n_splits = 10, shuffle = True, random_state = i)
  
  naive_bayes = GaussianNB()
  scores = cross_val_score(naive_bayes, X, y, cv = kfold)
  resultados_naive_bayes_cv.append(scores.mean())

  logistica = LogisticRegression()
  scores = cross_val_score(logistica, X, y, cv = kfold)
  resultados_logistica_cv.append(scores.mean())
  
  random_forest = RandomForestClassifier()
  scores = cross_val_score(random_forest, X, y, cv = kfold)
  resultados_forest_cv.append(scores.mean())

In [116]:
scores, 10 * 30

(array([0.995     , 0.98      , 0.985     , 0.99      , 0.975     ,
        0.99      , 0.985     , 0.98492462, 0.98994975, 1.        ]), 300)

In [117]:
scores.mean()

0.9874874371859297

In [118]:
print(resultados_naive_bayes_cv)

[0.9248618090452261, 0.9249170854271356, 0.9248894472361808, 0.9228819095477387, 0.9248718592964824, 0.9253894472361809, 0.9243844221105528, 0.9248894472361808, 0.9248718592964824, 0.9238894472361808, 0.9253844221105527, 0.9244020100502512, 0.9253743718592965, 0.924394472361809, 0.9253869346733667, 0.9248819095477387, 0.9258844221105527, 0.924894472361809, 0.9238819095477387, 0.9258844221105529, 0.9258944723618091, 0.9253894472361811, 0.92436432160804, 0.9263844221105527, 0.9228944723618092, 0.9253894472361809, 0.9248869346733668, 0.9253919597989949, 0.9253819095477386, 0.9258844221105527]


In [119]:
print(resultados_logistica_cv)

[0.9113718592964826, 0.9103668341708543, 0.918359296482412, 0.9158768844221106, 0.9133417085427136, 0.9108718592964824, 0.9188366834170856, 0.9188969849246231, 0.913356783919598, 0.9148693467336683, 0.9098718592964824, 0.9183542713567838, 0.9183793969849248, 0.9213618090452261, 0.9098643216080402, 0.914356783919598, 0.9103693467336684, 0.915856783919598, 0.9128542713567839, 0.9123668341708543, 0.9208944723618091, 0.9108668341708542, 0.9103467336683417, 0.9133768844221105, 0.9098592964824121, 0.9148768844221105, 0.910859296482412, 0.9113768844221104, 0.9113668341708543, 0.9098442211055277]


In [120]:
print(resultados_forest_cv)

[0.985467336683417, 0.9869824120603015, 0.9834773869346733, 0.9864673366834171, 0.9874798994974874, 0.9869849246231157, 0.9879748743718594, 0.9869874371859296, 0.9869798994974875, 0.9854849246231157, 0.9884798994974874, 0.9879748743718594, 0.9874773869346735, 0.9864798994974875, 0.9874874371859296, 0.9894849246231155, 0.9894899497487437, 0.9879723618090454, 0.9864798994974875, 0.9854824120603016, 0.9854748743718593, 0.9864798994974875, 0.9859824120603016, 0.9874798994974874, 0.9859798994974875, 0.9854849246231157, 0.9874773869346735, 0.9884798994974874, 0.9884874371859297, 0.9874874371859297]


In [121]:
stats.variation(resultados_naive_bayes) * 100, stats.variation(resultados_logistica) * 100, stats.variation(resultados_forest) * 100

(1.0124407504578252, 1.5821056616884057, 0.613975913292408)

In [122]:
stats.variation(resultados_naive_bayes_cv) * 100, stats.variation(resultados_logistica_cv) * 100, stats.variation(resultados_forest_cv) * 100

(0.08641071566366061, 0.38801026116292653, 0.1312640530800689)

### Seleção de atributos utilizando variância

In [123]:
np.random.rand(50)

array([0.65089933, 0.97524389, 0.22036995, 0.52937323, 0.28797733,
       0.40667784, 0.75146686, 0.16761304, 0.25555053, 0.9632511 ,
       0.94653784, 0.63192076, 0.04574021, 0.84770096, 0.78509091,
       0.20742577, 0.94498385, 0.54260419, 0.28071127, 0.69191985,
       0.84356702, 0.45463386, 0.62625953, 0.34367295, 0.12541984,
       0.39607398, 0.73277582, 0.50988641, 0.82632645, 0.90444474,
       0.16320791, 0.2760491 , 0.96002486, 0.16681834, 0.28433572,
       0.40426574, 0.10880274, 0.74821007, 0.110381  , 0.93887486,
       0.6314227 , 0.88771635, 0.31336585, 0.37066179, 0.29913121,
       0.53099096, 0.69313654, 0.71930304, 0.1243369 , 0.04399864])

In [124]:
np.random.randint(0, 2)

1

In [125]:
base_selecao = {'a': np.random.rand(20),
                'b': np.array([0.5] * 20),
                'classe': np.random.randint(0, 2, size = 20)}

In [126]:
base_selecao

{'a': array([0.07231497, 0.47453512, 0.43165631, 0.15312102, 0.91040108,
        0.40697028, 0.41387193, 0.73641596, 0.08615674, 0.83900009,
        0.54445683, 0.45771785, 0.54007235, 0.30885617, 0.23740864,
        0.82235157, 0.25306885, 0.602303  , 0.09217774, 0.1713564 ]),
 'b': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]),
 'classe': array([1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0])}

In [127]:
dataset = pd.DataFrame(base_selecao)
dataset.head()

Unnamed: 0,a,b,classe
0,0.072315,0.5,1
1,0.474535,0.5,1
2,0.431656,0.5,1
3,0.153121,0.5,1
4,0.910401,0.5,1


In [128]:
dataset.describe()

Unnamed: 0,a,b,classe
count,20.0,20.0,20.0
mean,0.427711,0.5,0.65
std,0.260684,0.0,0.48936
min,0.072315,0.5,0.0
25%,0.220896,0.5,0.0
50%,0.422764,0.5,1.0
75%,0.558918,0.5,1.0
max,0.910401,0.5,1.0


In [129]:
math.sqrt(0.08505323963215053)

0.2916388856653902

In [130]:
np.var(dataset['a']), np.var(dataset['b'])

(0.06455812287766798, 0.0)

In [131]:
X = dataset.iloc[:, 0:2].values
X

array([[0.07231497, 0.5       ],
       [0.47453512, 0.5       ],
       [0.43165631, 0.5       ],
       [0.15312102, 0.5       ],
       [0.91040108, 0.5       ],
       [0.40697028, 0.5       ],
       [0.41387193, 0.5       ],
       [0.73641596, 0.5       ],
       [0.08615674, 0.5       ],
       [0.83900009, 0.5       ],
       [0.54445683, 0.5       ],
       [0.45771785, 0.5       ],
       [0.54007235, 0.5       ],
       [0.30885617, 0.5       ],
       [0.23740864, 0.5       ],
       [0.82235157, 0.5       ],
       [0.25306885, 0.5       ],
       [0.602303  , 0.5       ],
       [0.09217774, 0.5       ],
       [0.1713564 , 0.5       ]])

In [132]:
from sklearn.feature_selection import VarianceThreshold

In [133]:
selecao = VarianceThreshold(threshold=0.07)
X_novo = selecao.fit_transform(X)

ValueError: ignored

In [134]:
X_novo, X_novo.shape

NameError: ignored

In [135]:
selecao.variances_

array([0.06455812, 0.        ])

In [136]:
indices = np.where(selecao.variances_ > 0.07)
indices

(array([], dtype=int64),)

#### Exercício seleção de atributos utilizando variância

In [137]:
dataset = pd.read_csv('credit_data.csv')

In [138]:
dataset.dropna(inplace=True)

In [139]:
dataset.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [140]:
dataset.describe()

Unnamed: 0,i#clientid,income,age,loan,c#default
count,1997.0,1997.0,1997.0,1997.0,1997.0
mean,1001.956935,45333.864334,40.807559,4445.487716,0.141713
std,576.702206,14325.131177,13.624469,3046.792457,0.348842
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,503.0,32804.904487,28.990415,1936.813257,0.0
50%,1002.0,45788.7471,41.317159,3977.287432,0.0
75%,1501.0,57787.565659,52.58704,6440.861434,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [141]:
X = dataset.iloc[:, 1:4].values
X

array([[66155.92509508,    59.01701507,  8106.53213129],
       [34415.15396582,    48.1171531 ,  6564.74501768],
       [57317.17006303,    63.10804949,  8020.95329639],
       ...,
       [44311.44926231,    28.0171669 ,  5522.78669326],
       [43756.05660491,    63.97179584,  1622.72259832],
       [69436.57955155,    56.15261703,  7378.83359873]])

In [142]:
y = dataset.iloc[:, 4].values
y

array([0, 0, 0, ..., 1, 0, 0])

In [143]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [144]:
X

array([[0.9231759 , 0.95743135, 0.58883739],
       [0.28812165, 0.86378597, 0.47682695],
       [0.74633429, 0.99257918, 0.58262011],
       ...,
       [0.48612202, 0.69109837, 0.40112895],
       [0.47500998, 1.        , 0.1177903 ],
       [0.98881367, 0.93282208, 0.53597028]])

In [145]:
selecao = VarianceThreshold(threshold=0.027)
X_novo = selecao.fit_transform(X)

In [146]:
X_novo

array([[0.9231759 , 0.58883739],
       [0.28812165, 0.47682695],
       [0.74633429, 0.58262011],
       ...,
       [0.48612202, 0.40112895],
       [0.47500998, 0.1177903 ],
       [0.98881367, 0.53597028]])

In [147]:
np.var(X[0]), np.var(X[1]), np.var(X[2])

(0.027646353650092215, 0.057415153407223446, 0.02838948027619903)

In [148]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
naive_sem_selecao = GaussianNB()
naive_sem_selecao.fit(X, y)
previsoes = naive_sem_selecao.predict(X)
accuracy_score(previsoes, y)

0.9253880821231848

In [149]:
naive_com_selecao = GaussianNB()
naive_com_selecao.fit(X_novo, y)
previsoes = naive_com_selecao.predict(X_novo)
accuracy_score(previsoes, y)

0.8472709063595393

## Valores faltantes com média e moda

### Média

In [150]:
import pandas as pd
dataset = pd.read_csv('credit_data.csv')

In [151]:
dataset.isnull().sum()

i#clientid    0
income        0
age           3
loan          0
c#default     0
dtype: int64

In [152]:
nulos = dataset[dataset.isnull().any(axis=1)]
nulos

Unnamed: 0,i#clientid,income,age,loan,c#default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [153]:
dataset['age'].mean(), dataset['age'].median()

(40.80755937840458, 41.3171591130085)

In [154]:
dataset['age'] = dataset['age'].replace(to_replace = np.nan, value = dataset['age'].mean())

In [155]:
dataset[dataset.isnull().any(axis=1)]

Unnamed: 0,i#clientid,income,age,loan,c#default


### Moda

In [156]:
#dataset = pd.read_csv('autos.csv', encoding='ISO-8859-1')
dataset = pd.read_csv('autos.csv')


FileNotFoundError: ignored

In [157]:
dataset.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [158]:
dataset.isnull().sum()

i#clientid    0
income        0
age           0
loan          0
c#default     0
dtype: int64

In [159]:
dataset['fuelType'].unique()

KeyError: ignored

In [160]:
stats.mode(dataset['fuelType'])

KeyError: ignored

In [161]:
statistics.mode(dataset['fuelType'])

KeyError: ignored

In [162]:
dataset['fuelType'] = dataset['fuelType'].replace(to_replace = np.nan, value = statistics.mode(dataset['fuelType']))

KeyError: ignored

In [163]:
dataset['fuelType'].unique()

KeyError: ignored