<a href="https://colab.research.google.com/github/yeesem/Machine-Learning/blob/main/HVC_DTS_(Classification_and_Clustering).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import zipfile

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip '/content/drive/MyDrive/Test 2/hvc_dts.csv.gzip'

Archive:  /content/drive/MyDrive/Test 2/hvc_dts.csv.gzip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of /content/drive/MyDrive/Test 2/hvc_dts.csv.gzip or
        /content/drive/MyDrive/Test 2/hvc_dts.csv.gzip.zip, and cannot find /content/drive/MyDrive/Test 2/hvc_dts.csv.gzip.ZIP, period.


In [None]:
# Read the CSV file into a pandas DataFrame
data = pd.read_csv('/content/drive/MyDrive/Test 2/hvc_dts.csv.gzip')

In [None]:
data.head()

Unnamed: 0,Site,Segment,Timestamp,Location,ElectricalCurrent,ElectricalVoltage,Temperature,Performance
0,Oceania,foo-ba,2018-04-19 16:12:23,31.42,4817.024581,572196.9,92.584483,UNDERPERFORMING
1,Oceania,foo-ba,2018-04-19 16:12:23,1665.05,2592.415864,1063212.0,78.983756,UNDERPERFORMING
2,Oceania,foo-ba,2018-04-19 16:12:23,3298.67,1051.501885,2621285.0,69.346272,
3,Oceania,foo-ba,2018-04-19 16:12:23,4932.3,194.263779,14188370.0,70.354899,UNDERPERFORMING
4,Oceania,foo-ba,2018-04-19 16:12:23,6565.93,20.713071,133069900.0,63.940873,UNDERPERFORMING


# Data Cleaning

In [None]:
data['Performance'].value_counts()

Performance
UNDERPERFORMING    8073272
RUNNING             171824
BREAKING                 5
Name: count, dtype: int64

In [None]:
# Drop the duplicated rows
data.duplicated().sum()

0

In [None]:
data.isnull().sum()

Site                 1093720
Segment              1093720
Timestamp                  0
Location                   0
ElectricalCurrent          0
ElectricalVoltage          0
Temperature                0
Performance          2061275
dtype: int64

In [None]:
data = data.dropna(subset = ['Site','Segment'])

In [None]:
data = data.drop(columns = ['Timestamp','Location'])

In [None]:
data.isnull().sum()

Site                       0
Segment                    0
ElectricalCurrent          0
ElectricalVoltage          0
Temperature                0
Performance          1842743
dtype: int64

In [None]:
data[data['Performance'] == 'BREAKING']

Unnamed: 0,Site,Segment,ElectricalCurrent,ElectricalVoltage,Temperature,Performance
6715746,Gale,hogera-hogehoge,0.0,0.0,56.7,BREAKING
6715748,Gale,hogera-hogehoge,0.0,0.0,56.7,BREAKING
6715749,Gale,hogera-hogehoge,0.0,0.0,56.7,BREAKING
6715750,Gale,hogera-hogehoge,0.0,0.0,56.7,BREAKING
6715751,Gale,hogera-hogehoge,0.0,0.0,56.7,BREAKING


In [None]:
data[data['Performance'] == 'RUNNING'].isnull().sum()

Site                 0
Segment              0
ElectricalCurrent    0
ElectricalVoltage    0
Temperature          0
Performance          0
dtype: int64

In [None]:
data[data['Performance'] == 'UNDERPERFORMING'].isnull().sum()

Site                 0
Segment              0
ElectricalCurrent    0
ElectricalVoltage    0
Temperature          0
Performance          0
dtype: int64

In [None]:
performance_with_null_df = data[data['Performance'].isnull()]
print("Number of rows of performance_with_null_df : ",len(performance_with_null_df))
performance_with_null_df.head()

Number of rows of performance_with_null_df :  1842743


Unnamed: 0,Site,Segment,ElectricalCurrent,ElectricalVoltage,Temperature,Performance
2,Oceania,foo-ba,1051.501885,2621285.0,69.346272,
9,Oceania,r-qu,74.182785,37181920.0,66.213282,
14,Oceania,ba-r,168.804692,16288060.0,60.649075,
17,Oceania,qu-x,57.997403,47260480.0,75.220716,
18,Oceania,qu-x,38.351769,71469580.0,73.269612,


**Up Sampling**

In [None]:
# Separate majority and minority classes
minority_class = data[data['Performance'] == 'BREAKING']
majority_class = data[data['Performance'] == 'RUNNING']

# Upsample minority class
upsampled_minority_class = minority_class.sample(n=len(majority_class), replace=True, random_state=42)

# Concatenate the upsampled minority class with the majority class
upsampled_data = pd.concat([majority_class, upsampled_minority_class])

# Shuffle the DataFrame to mix up the classes
upsampled_df = upsampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
upsampled_df['Performance'].value_counts()

Performance
BREAKING    171824
RUNNING     171824
Name: count, dtype: int64

**Down Sampling**

In [None]:
# Assuming df is your DataFrame with imbalanced classes
# Assuming 'target_column' is the column representing the target variable

# Separate majority and minority classes
majority_class = data[data['Performance'] == 'UNDERPERFORMING']
minority_class = data[data['Performance'] == 'RUNNING']

# Downsample majority class
downsampled_majority_class = majority_class.sample(n=len(minority_class), random_state=42)

# Concatenate the downsampled majority class with the minority class
down_sampled_df = pd.concat([downsampled_majority_class, minority_class])

# Shuffle the DataFrame to mix up the classes
down_sampled_df = down_sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
down_sampled_df['Performance'].value_counts()

Performance
RUNNING            171824
UNDERPERFORMING    171824
Name: count, dtype: int64

In [None]:
balance_df = pd.concat([upsampled_df,down_sampled_df[down_sampled_df['Performance'] == 'UNDERPERFORMING']],axis = 0)

balance_df['Performance'].value_counts()

Performance
BREAKING           171824
RUNNING            171824
UNDERPERFORMING    171824
Name: count, dtype: int64

In [None]:
performance_dir = {'BREAKING' : 0,'RUNNING' : 1,'UNDERPERFORMING' : 2}

balance_df['Performance'] = balance_df['Performance'].map(performance_dir)

balance_df.head()

Unnamed: 0,Site,Segment,ElectricalCurrent,ElectricalVoltage,Temperature,Performance
0,Gale,hogera-hogehoge,0.0,0.0,56.7,0
1,Gale,hogera-hogehoge,0.0,0.0,56.7,0
2,Oceania,foo-ba,1057.268982,2534718.0,69.328253,1
3,Gale,hogera-hogehoge,3610.031861,744920.6,87.049824,1
4,Gale,hogera-hogehoge,0.0,0.0,56.7,0


In [None]:
norm_balance_df = pd.get_dummies(balance_df,columns = ['Segment','Site']).astype(int)
norm_balance_df.head()

Unnamed: 0,ElectricalCurrent,ElectricalVoltage,Temperature,Performance,Segment_ba-r,Segment_ba-z,Segment_corge-garply,Segment_foo-ba,Segment_foobar-corge,Segment_fred-waldo,...,Segment_qu-ux,Segment_qu-x,Segment_r-qu,Segment_thud-fred,Segment_thud-xyzzy,Segment_xyzzy-plugh,Segment_z-qu,Site_Gale,Site_Oceania,Site_Slump
0,0,0,56,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,56,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1057,2534718,69,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3610,744920,87,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,56,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
from sklearn.preprocessing import StandardScaler

X = norm_balance_df.drop(columns = ['Performance'])
y = norm_balance_df['Performance']

# Create StandardScaler object
scaler = StandardScaler()

# Fit and transform the data
scaled_X = scaler.fit_transform(X)

scaled_X = pd.DataFrame(scaled_X,columns = norm_balance_df.drop(columns = ['Performance']).columns)

scaled_X.head()

Unnamed: 0,ElectricalCurrent,ElectricalVoltage,Temperature,Segment_ba-r,Segment_ba-z,Segment_corge-garply,Segment_foo-ba,Segment_foobar-corge,Segment_fred-waldo,Segment_fuga-piyo,...,Segment_qu-ux,Segment_qu-x,Segment_r-qu,Segment_thud-fred,Segment_thud-xyzzy,Segment_xyzzy-plugh,Segment_z-qu,Site_Gale,Site_Oceania,Site_Slump
0,-0.582901,-0.172152,-1.11594,-0.138886,-0.083652,-0.189175,-0.283594,-0.141519,-0.138799,-0.067819,...,-0.147507,-0.232678,-0.153777,-0.078849,-0.167246,-0.212325,-0.108448,0.745617,-0.51199,-0.419445
1,-0.582901,-0.172152,-1.11594,-0.138886,-0.083652,-0.189175,-0.283594,-0.141519,-0.138799,-0.067819,...,-0.147507,-0.232678,-0.153777,-0.078849,-0.167246,-0.212325,-0.108448,0.745617,-0.51199,-0.419445
2,0.150093,-0.169122,-0.007932,-0.138886,-0.083652,-0.189175,3.526173,-0.141519,-0.138799,-0.067819,...,-0.147507,-0.232678,-0.153777,-0.078849,-0.167246,-0.212325,-0.108448,-1.341171,1.953162,-0.419445
3,1.920514,-0.171262,1.526234,-0.138886,-0.083652,-0.189175,-0.283594,-0.141519,-0.138799,-0.067819,...,-0.147507,-0.232678,-0.153777,-0.078849,-0.167246,-0.212325,-0.108448,0.745617,-0.51199,-0.419445
4,-0.582901,-0.172152,-1.11594,-0.138886,-0.083652,-0.189175,-0.283594,-0.141519,-0.138799,-0.067819,...,-0.147507,-0.232678,-0.153777,-0.078849,-0.167246,-0.212325,-0.108448,0.745617,-0.51199,-0.419445


In [None]:
# Split data into test and train sets
from sklearn.model_selection import train_test_split

# Assuming scaled_data is your feature matrix and labels is your target variable
X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2, random_state=42)

# Modelling

**KNeighborsClassifier**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Create a KNN classifier with k=5
knn = KNeighborsClassifier(n_neighbors=3)

# Train the classifier on the training data
knn.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = knn.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9581647994568117


**LogisticRegression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Assuming X_train, X_test, y_train, y_test are your training and testing data
# Create a logistic regression classifier
logreg = LogisticRegression()

# Train the classifier on the training data
logreg.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = logreg.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9557398515931907


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
