<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/research/AAAMLP_Notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv

In [33]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [3]:
# The downloaded CSV separates fields with semicolons rather than commas
dataset = pd.read_csv(filepath_or_buffer="winequality-red.csv", sep=";")

In [4]:
# Peek at the first five rows to sanity-check the parse
dataset.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# Shift the raw quality scores 3..8 down to zero-based class ids 0..5.
# NOTE: this overwrites the column in place, so running the cell a second time
# remaps the already-shifted ids (values without a key in the dict become NaN).
quality_mapping = {score: score - 3 for score in range(3, 9)}
dataset["quality"] = dataset["quality"].map(quality_mapping)

In [6]:
# Randomize the row order. The fixed seed makes the head/tail split below (and every
# accuracy computed from it) reproducible under Restart & Run All.
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [7]:
# First 1000 shuffled rows are for training; every remaining row is held out for validation.
# Slicing at a single boundary keeps the two parts disjoint and exhaustive whatever the
# row count is (a hard-coded tail(599) would overlap or drop rows if it were not 1599).
dataset_train = dataset.iloc[:1000]
dataset_valid = dataset.iloc[1000:]

In [8]:
# Feature columns = every column except the label
columns = dataset.columns.tolist()
columns.remove("quality")
columns

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

In [9]:
# Shallow decision tree (at most 3 levels) used as the first baseline
classifier = DecisionTreeClassifier(max_depth=3)

In [10]:
# Fit on the training rows only; the fitted estimator is the cell's displayed value
classifier.fit(dataset_train[columns], dataset_train["quality"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=3, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
# Accuracy on the same rows the tree was fitted on
train_predictions = classifier.predict(dataset_train[columns])
train_accuracy = accuracy_score(
  y_true=dataset_train["quality"],
  y_pred=train_predictions,
)
train_accuracy

0.633

In [12]:
# Accuracy on the held-out validation rows
valid_predictions = classifier.predict(dataset_valid[columns])
valid_accuracy = accuracy_score(
  y_true=dataset_valid["quality"],
  y_pred=valid_predictions,
)
valid_accuracy

0.5242070116861436

In [13]:
# Refit on the same split with a deeper tree (max_depth=7)
classifier = DecisionTreeClassifier(max_depth=7).fit(
  dataset_train[columns], dataset_train["quality"]
)

# Predict both partitions first, then score them
train_predictions = classifier.predict(dataset_train[columns])
valid_predictions = classifier.predict(dataset_valid[columns])

train_accuracy = accuracy_score(dataset_train["quality"], train_predictions)
valid_accuracy = accuracy_score(dataset_valid["quality"], valid_predictions)

print(f"Train Accuracy: {train_accuracy: .3f} || Valid Accuracy: {valid_accuracy: .3f}")

Train Accuracy:  0.808 || Valid Accuracy:  0.536


In [14]:
# Sweep max_depth over 1..25 and print train vs. validation accuracy for each depth
# (one output line per depth, in increasing order)

for depth in range(1, 26):
  classifier = DecisionTreeClassifier(max_depth=depth).fit(
    dataset_train[columns], dataset_train["quality"]
  )

  train_predictions = classifier.predict(dataset_train[columns])
  valid_predictions = classifier.predict(dataset_valid[columns])

  train_accuracy = accuracy_score(dataset_train["quality"], train_predictions)
  valid_accuracy = accuracy_score(dataset_valid["quality"], valid_predictions)

  print(f"Train Accuracy: {train_accuracy: .3f} || Valid Accuracy: {valid_accuracy: .3f}")

Train Accuracy:  0.568 || Valid Accuracy:  0.541
Train Accuracy:  0.569 || Valid Accuracy:  0.533
Train Accuracy:  0.633 || Valid Accuracy:  0.524
Train Accuracy:  0.666 || Valid Accuracy:  0.559
Train Accuracy:  0.711 || Valid Accuracy:  0.528
Train Accuracy:  0.757 || Valid Accuracy:  0.548
Train Accuracy:  0.808 || Valid Accuracy:  0.544
Train Accuracy:  0.857 || Valid Accuracy:  0.541
Train Accuracy:  0.891 || Valid Accuracy:  0.531
Train Accuracy:  0.925 || Valid Accuracy:  0.539
Train Accuracy:  0.948 || Valid Accuracy:  0.546
Train Accuracy:  0.960 || Valid Accuracy:  0.538
Train Accuracy:  0.977 || Valid Accuracy:  0.553
Train Accuracy:  0.985 || Valid Accuracy:  0.539
Train Accuracy:  0.993 || Valid Accuracy:  0.543
Train Accuracy:  0.997 || Valid Accuracy:  0.553
Train Accuracy:  0.998 || Valid Accuracy:  0.563
Train Accuracy:  0.999 || Valid Accuracy:  0.543
Train Accuracy:  1.000 || Valid Accuracy:  0.556
Train Accuracy:  1.000 || Valid Accuracy:  0.554
Train Accuracy:  1.0

In [18]:
# Calculating the distribution of label values.
# value_counts does the counting in pandas instead of a Python-level loop; reindexing
# over 0..5 restores the fixed six-slot layout (a class with no rows is reported as 0).
labels = dataset["quality"]
distribution_labels = (
  labels.value_counts().reindex(range(6), fill_value=0).tolist()
)
print(distribution_labels)
# Highly skewed dataset favouring some labels --> Using stratified k-fold

[10, 53, 681, 638, 199, 18]


In [22]:
# Build the fold table from a shuffled *copy* of `dataset`:
#  - sample() returns a new frame, so adding "kfold" here no longer mutates `dataset`
#    (plain assignment only aliased it, leaking the column into the original frame);
#  - the shuffled result is actually kept (it used to be computed and thrown away);
#  - reset_index gives a 0..n-1 index, which the positional indices returned by
#    kf.split rely on when they are used with .loc below.
dataset_skfold = dataset.sample(frac=1, random_state=42).reset_index(drop=True)
dataset_skfold["kfold"] = -1  # placeholder until every row is assigned a fold
y = dataset_skfold.quality

kf = StratifiedKFold(n_splits=5)

# Each row appears in exactly one validation split; record that split's number
for fold, (train_index, valid_index) in enumerate(kf.split(X=dataset_skfold, y=y)):
  dataset_skfold.loc[valid_index, 'kfold'] = fold

In [30]:
# Per-class row counts inside validation folds 0 and 1, printed one after the other
for fold_id, heading in ((0, "For fold=0"), (1, "For fold=1")):
  labels = dataset_skfold.loc[dataset_skfold["kfold"] == fold_id, "quality"]
  distribution_labels = [0] * 6
  for label in labels:
    distribution_labels[label] += 1
  print(heading)
  print(distribution_labels)

For fold=0
[2, 11, 136, 128, 40, 3]
For fold=1
[2, 10, 136, 128, 40, 4]


In [32]:
# For each of the 5 folds: fit a fresh depth-3 tree on the other folds and
# report accuracy on both the training part and the held-out fold
for fold in range(5):
  is_valid = dataset_skfold["kfold"] == fold
  dataset_skfold_valid = dataset_skfold[is_valid]
  dataset_skfold_train = dataset_skfold[~is_valid]

  classifier = DecisionTreeClassifier(max_depth=3).fit(
    dataset_skfold_train[columns], dataset_skfold_train["quality"]
  )

  train_predictions = classifier.predict(dataset_skfold_train[columns])
  train_accuracy = accuracy_score(dataset_skfold_train["quality"], train_predictions)

  valid_predictions = classifier.predict(dataset_skfold_valid[columns])
  valid_accuracy = accuracy_score(dataset_skfold_valid["quality"], valid_predictions)

  print(f"Train Accuracy: {train_accuracy: .3f} || Valid Accuracy: {valid_accuracy: .3f}\n")

Train Accuracy:  0.576 || Valid Accuracy:  0.556

Train Accuracy:  0.595 || Valid Accuracy:  0.547

Train Accuracy:  0.590 || Valid Accuracy:  0.566

Train Accuracy:  0.616 || Valid Accuracy:  0.497

Train Accuracy:  0.580 || Valid Accuracy:  0.536



In [None]:
# Making bins for regression dataset
def create_folds_regression(dataset, target_col="targets", n_splits=5):
  """Assign stratified k-fold ids to a regression dataset by binning its target.

  A continuous target cannot be stratified directly, so it is first cut into
  equal-width bins and the bin ids are used as the stratification labels.

  Args:
    dataset: DataFrame holding the features and the target column.
    target_col: name of the continuous target column (default "targets").
    n_splits: number of folds to create (default 5).

  Returns:
    A shuffled copy of `dataset` with a fresh 0..n-1 index and an extra integer
    "kfold" column giving each row's validation fold. The input frame is not modified.

  Raises:
    ValueError: if `dataset` has no rows (the bin count is undefined for n=0).
  """
  if len(dataset) == 0:
    raise ValueError("cannot create folds for an empty dataset")

  # Keep the shuffled result (sample() is not in-place); it is a new frame, so the
  # caller's DataFrame is left untouched by the column added below.
  dataset = dataset.sample(frac=1).reset_index(drop=True)
  dataset["kfold"] = -1

  # Number of bins from Sturges' rule: 1 + log2(n), rounded down
  num_bins = int(np.floor(1 + np.log2(len(dataset))))

  # labels=False yields integer bin ids; they are kept in a local Series rather than
  # as a column so no helper column ends up in the returned frame
  bins = pd.cut(dataset[target_col], bins=num_bins, labels=False)

  kf = StratifiedKFold(n_splits=n_splits)
  # The positional indices from split() line up with .loc because the index was reset
  for fold, (_, valid_index) in enumerate(kf.split(X=dataset, y=bins)):
    dataset.loc[valid_index, "kfold"] = fold

  return dataset

# Synthetic regression problem: 150000 samples, 100 features, a single target
X, y = datasets.make_regression(n_samples=150000, n_features=100, n_targets=1)

# Feature columns are named f_0 .. f_99; the target goes in a "targets" column
dataset = pd.DataFrame(
  X, columns=[f"f_{col}" for col in range(X.shape[-1])]
).assign(targets=y)

dataset = create_folds_regression(dataset)