<a href="https://colab.research.google.com/github/yeamuhid/mashing-learning/blob/main/training_and_testing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy dataset: five rows with two numeric feature columns and a binary label.
data = dict(
    Feature1=[10, 20, 30, 40, 50],
    Feature2=[15, 25, 35, 45, 55],
    Label=[0, 1, 0, 1, 0],
)

# One DataFrame column per dictionary key.
df = pd.DataFrame(data)

# The label column is the prediction target (y); the named feature
# columns are the model inputs (X).
feature_columns = ['Feature1', 'Feature2']
y = df['Label']
X = df[feature_columns]

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state pins the shuffle so repeated runs produce the same split.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show each piece of the split under its own heading.
for heading, subset in (
    ("Training Features:\n", X_train),
    ("\nTesting Features:\n", X_test),
    ("\nTraining Labels:\n", y_train),
    ("\nTesting Labels:\n", y_test),
):
    print(heading, subset)


Training Features:
    Feature1  Feature2
4        50        55
2        30        35
0        10        15
3        40        45

Testing Features:
    Feature1  Feature2
1        20        25

Training Labels:
 4    0
2    0
0    0
3    1
Name: Label, dtype: int64

Testing Labels:
 1    1
Name: Label, dtype: int64


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.datasets import make_classification

# Generate a synthetic binary-classification dataset for the demonstration.
X, y = make_classification(
    n_samples=1000,      # Number of samples (rows)
    n_features=10,       # Total number of feature columns
    n_informative=8,     # Number of informative features
    n_redundant=2,       # Number of redundant features
    n_classes=2,         # Number of target classes
    weights=[0.7, 0.3],  # Class imbalance (roughly 70% / 30%)
    random_state=42      # Reproducible data generation
)

# Wrap the arrays in a DataFrame: Feature_0 ... Feature_9 plus a 'Label' column.
data = pd.DataFrame(X, columns=[f"Feature_{i}" for i in range(X.shape[1])])
data['Label'] = y

# 1. Hold-out split with stratification, so the train and test sets keep
#    the same class proportions as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='Label'),  # Features
    data['Label'],               # Target
    test_size=0.2,               # 20% for testing
    stratify=data['Label'],      # Maintain class distribution
    random_state=42              # Reproducibility
)

# 2. Stratified 5-fold cross-validation on the training portion only; the
#    test set stays untouched. enumerate(..., start=1) numbers the folds
#    directly instead of maintaining a manual counter.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
    print(f"Fold {fold}:")
    # Only the first ten indices of each side are shown to keep output short.
    print("Training indices:", train_idx[:10], "...")
    print("Validation indices:", val_idx[:10], "...")

# 3. Summary of splits: set shapes and per-class counts.
print("\nTraining set size:", X_train.shape)
print("Testing set size:", X_test.shape)
print("Class distribution in training set:", y_train.value_counts())
print("Class distribution in testing set:", y_test.value_counts())


Fold 1:
Training indices: [ 0  1  3  5  6  7  8  9 11 12] ...
Validation indices: [ 2  4 10 18 27 28 29 33 43 50] ...
Fold 2:
Training indices: [ 1  2  3  4  5  6  7  8  9 10] ...
Validation indices: [ 0 11 14 15 16 17 20 21 22 24] ...
Fold 3:
Training indices: [ 0  1  2  4  5  6  9 10 11 12] ...
Validation indices: [ 3  7  8 13 23 30 31 35 37 39] ...
Fold 4:
Training indices: [ 0  1  2  3  4  7  8 10 11 12] ...
Validation indices: [ 5  6  9 19 26 40 42 45 51 53] ...
Fold 5:
Training indices: [ 0  2  3  4  5  6  7  8  9 10] ...
Validation indices: [ 1 12 48 49 52 58 60 64 68 69] ...

Training set size: (800, 10)
Testing set size: (200, 10)
Class distribution in training set: Label
0    558
1    242
Name: count, dtype: int64
Class distribution in testing set: Label
0    140
1     60
Name: count, dtype: int64


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split

# Example dataset built from plain Python lists:
# ten single-feature samples [1] .. [10] with labels alternating 0, 1.
X = [[value] for value in range(1, 11)]  # Features
y = [0, 1] * 5                           # Labels

# Reserve 30% of the samples for testing; random_state makes the shuffle
# repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report each piece of the split on its own line.
for heading, subset in (
    ("Training Features:", X_train),
    ("Testing Features:", X_test),
    ("Training Labels:", y_train),
    ("Testing Labels:", y_test),
):
    print(heading, subset)


Training Features: [[1], [8], [3], [10], [5], [4], [7]]
Testing Features: [[9], [2], [6]]
Training Labels: [0, 1, 0, 1, 0, 1, 0]
Testing Labels: [0, 1, 1]


In [None]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
import pandas as pd

# Example dataset: ten customers described by age and salary, with a
# binary flag recording whether they purchased.
data = {
    'Age': list(range(25, 75, 5)),              # 25, 30, ..., 70
    'Salary': list(range(50000, 90000, 4000)),  # 50000, 54000, ..., 86000
    'Purchased': [0, 1] * 5,                    # alternating; 0 = No, 1 = Yes
}

# One DataFrame column per dictionary key.
df = pd.DataFrame(data)

# Target is the 'Purchased' flag (0 or 1); inputs are Age and Salary.
feature_columns = ['Age', 'Salary']
y = df['Purchased']
X = df[feature_columns]

# Reserve 30% of the rows for testing; random_state makes the shuffle
# repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show each piece of the split under its own heading.
for heading, subset in (
    ("Training Features:\n", X_train),
    ("\nTesting Features:\n", X_test),
    ("\nTraining Labels:\n", y_train),
    ("\nTesting Labels:\n", y_test),
):
    print(heading, subset)


Training Features:
    Age  Salary
0   25   50000
7   60   78000
2   35   58000
9   70   86000
4   45   66000
3   40   62000
6   55   74000

Testing Features:
    Age  Salary
8   65   82000
1   30   54000
5   50   70000

Training Labels:
 0    0
7    1
2    0
9    1
4    0
3    1
6    0
Name: Purchased, dtype: int64

Testing Labels:
 8    0
1    1
5    1
Name: Purchased, dtype: int64
