#  Classificação KNN

O KNN (K-Nearest Neighbors) é um dos algoritmos mais simples de Machine Learning. É um algoritmo do tipo "lazy" (preguiçoso), ou seja, nenhuma computação é realizada sobre o dataset até que um novo ponto de dados precise ser classificado.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
df = pd.read_excel("logi1.xlsx")
df.head()

In [None]:
plt.scatter(df.index, df['Score'], c = df['Accepted'].astype('category').cat.codes)
plt.xlabel('Provas Alunos')
plt.ylabel('Score')
plt.show()

### Preparação dos dados

In [None]:
X = df["Score"].values.reshape(-1, 1)
y = df["Accepted"]

In [None]:
print(X[:3])
print(y[:3])

### Aplicação do modelo

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# K-nearest-neighbours classifier configured with 5 neighbours.
modelo = KNeighborsClassifier(n_neighbors=5)

In [None]:
# Train on the score column (X) against the acceptance label (y).
modelo.fit(X,y)

### Uso do modelo

#### modelo.predict()  retornará 1 ou 0 – (Accepted or Not)

In [None]:
modelo.predict(1200)

In [None]:
modelo.predict_proba(1200) # (40% de chance de não ser Aceito(0), 60% chance de ser  Aceito(1))

In [None]:
modelo.predict(1400)

In [None]:
modelo.predict(550)

In [None]:
modelo.predict_proba(1400)

In [None]:
modelo.predict_proba(550)

## ***KNN - dados do Titanic***

In [None]:
df = pd.read_csv("titanic_train.csv")
df.head()

In [None]:
df['fsex'] = df.apply(lambda row: 0 if row['Sex'] == "male" else 1, axis=1)
df.head()

### Pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
### Survived: 0 = No, 1 = Yes

In [None]:
print("Número de passageiros= ",len(df))

In [None]:
plt.scatter(   df.index, df.Age, c = df['Survived'].astype('category').cat.codes);
plt.xlabel('# passagers')
plt.ylabel('Age');
plt.show()

In [None]:
X = df["Pclass"].values.reshape(-1, 1)
y = df["Survived"]

In [None]:
modelo = KNeighborsClassifier(n_neighbors=5)

In [None]:
modelo.fit(X,y)

In [None]:
modelo.predict(2)

In [None]:
modelo.predict_proba(2)

In [None]:
# modelo 2 - com idade
import numpy as np

In [None]:
# Remove rows whose age is missing. NaN never compares equal to anything
# (not even itself), so `df.Age != np.NaN` is True for every row and filters
# nothing; `notna()` is the explicit missing-value test. It also avoids the
# `np.NaN` alias, which no longer exists in NumPy 2.0.
df = df[df.Age.notna()]
df.head()

In [None]:
len(df)

In [None]:
# remover valores missing - nan
df = df.dropna(axis=0, how='any')
len(df)

In [None]:
X = df["Age"].values.reshape(-1, 1)
y = df["Survived"]

In [None]:
modelo = KNeighborsClassifier(n_neighbors=5)
modelo.fit(X,y)

In [None]:
modelo.predict(30)

In [None]:
modelo.predict_proba(30)

In [None]:
modelo.predict_proba(10)

In [None]:
modelo.predict(50)

In [None]:
modelo.predict_proba(50)

In [None]:
modelo.predict_proba(60)

In [None]:
# modelo 3 - fare (valor do ticket)

In [None]:
df = pd.read_csv("titanic_train.csv")
df.head()

In [None]:
df.describe()

In [None]:
X = df["Fare"].values.reshape(-1, 1)
y = df["Survived"]

In [None]:
modelo = KNeighborsClassifier(n_neighbors=5)
modelo.fit(X,y)

In [None]:
modelo.predict(10)

In [None]:
modelo.predict_proba(10)

In [None]:
valor = 200 # 56
print(modelo.predict(valor), modelo.predict_proba(valor))

### fim

## Normalizar os dados (Usar mais de uma variável preditora)

In [None]:
# Load the table that the normalisation cells operate on, then display it.
df = pd.read_excel("dados.xlsx")
df

### Normalização de dados - forma 1

In [None]:
# Mean normalisation: centre each column on its mean, then divide by the
# column's range (max - min).
df_norm = df.sub(df.mean()).div(df.max().sub(df.min()))
df_norm

### Normalização de dados - forma 2

In [None]:
from sklearn import preprocessing

# Min-max scaling with scikit-learn. `fit_transform` returns a plain NumPy
# array, so the result is wrapped back into a DataFrame with named columns.
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df)
df_normalized = pd.DataFrame(np_scaled, columns=["Altura", "Peso", "Salario"])
df_normalized

## fim

In [None]:
from sklearn import datasets
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision regions over two features.

    The classifier is evaluated on a dense grid spanning the data (padded by
    1 on every side), the predicted class is drawn as filled contours, and
    the samples are scattered on top with one marker/colour per class.

    Parameters
    ----------
    X : array indexed as ``X[:, 0]`` and ``X[:, 1]``
        Sample coordinates; only the first two columns are used.
    y : array
        Class label of each row of ``X`` (compared element-wise with ``==``).
    classifier : object
        Fitted model exposing ``predict`` for an array of shape (n, 2).
    test_idx : sequence of row indices, optional
        Rows of ``X`` to highlight with hollow circles labelled 'test set'.
        ``None`` or an empty sequence draws no highlight.
    resolution : float
        Step of the evaluation grid along both axes.

    Raises
    ------
    IndexError
        If ``y`` holds more than five distinct classes (only five markers
        and colours are defined).
    """
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    # Flatten the grid into (n_points, 2), predict, and restore the grid shape.
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # plot all samples
    for idx, cl in enumerate(classes):
        # `color=` (not `c=`): a single RGBA tuple passed as `c` is ambiguous
        # with per-point values when a class has exactly four samples.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, color=cmap(idx),
                    marker=markers[idx], label=cl)

    # highlight test samples
    # Explicit None/length check: plain truthiness is ambiguous for arrays.
    if test_idx is not None and len(test_idx) > 0:
        X_test = X[test_idx, :]
        # Hollow circles: `c=''` is rejected by current matplotlib, so the
        # face is disabled explicitly and the outline is drawn in black.
        plt.scatter(X_test[:, 0], X_test[:, 1],
                    facecolors='none', edgecolors='black',
                    alpha=1.0, linewidth=1, marker='o',
                    s=55, label='test set')

# Iris data restricted to columns 2 and 3 (labelled as petal length and
# petal width in the plot below).
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training split only and reuse its mean/std for the
# test split. Re-fitting on the test data would scale the two splits
# differently and leak test statistics into the evaluation.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

# Training rows first, test rows last, so the test rows occupy the tail.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

knn = KNeighborsClassifier(n_neighbors=5, p=2,
                           metric='minkowski')
knn.fit(X_train_std, y_train)

# Test rows are the ones stacked after the training rows.
plot_decision_regions(X_combined_std, y_combined,
                      classifier=knn,
                      test_idx=range(len(y_train), len(y_combined)))

# The plotted values are standardized, not centimetres.
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.show()

In [None]:
%matplotlib inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

In [None]:
iris = pd.read_csv('Iris.csv')

In [None]:
iris

In [None]:
iris['Species'] = iris['Species'].astype('category')

In [None]:
plt.scatter(iris['SepalLengthCm'], iris['SepalWidthCm'], c = iris['Species'].astype('category').cat.codes);
plt.xlabel('Sepal length (cm)')
plt.ylabel('Sepal width (cm)');

In [None]:
plt.scatter(iris['PetalLengthCm'], iris['PetalWidthCm'], c = iris['Species'].astype('category').cat.codes)
plt.xlabel('Petal Length (cm)')
plt.ylabel('Petal Width (cm)');

In [None]:
model = LogisticRegression()

In [None]:
mask = np.random.rand(len(iris)) <= 0.8

train = iris[mask]
test = iris[~mask]

In [None]:
X_train = train.drop('Species', axis = 1).drop('Id', axis = 1)
y_train = train['Species']

In [None]:
X_test = test.drop('Species', axis = 1).drop('Id', axis = 1)
y_test = test['Species']

In [None]:
X_train

In [None]:
model.fit(X_train, y_train); 

In [None]:
prediction = model.predict(X_test)

In [None]:
import sklearn.metrics as metrics

metrics.confusion_matrix(y_true = y_test, y_pred = prediction)

In [None]:
metrics.accuracy_score(y_true = y_test, y_pred = prediction)


In [None]:
len(X_train)

In [None]:
model.coef_

In [None]:
model.intercept_