In [None]:
import time
import numpy as np
import pandas as pd
from model import dataClassifier

In [None]:
# Import Dataset
curated_bow = pd.read_csv('class_noise_cleaned.csv')
curated_bow = curated_bow.drop(['contents'], axis=1)

# Drop rows that contain missing values. dropna() returns a new frame rather
# than modifying in place, so its result has to be assigned. The index is then
# renumbered so it stays a contiguous 0..n-1 range, which later cells rely on
# when they align per-row scores against this frame by index.
curated_bow = curated_bow.dropna().reset_index(drop=True)

# Strip surrounding whitespace from column names
curated_bow.columns = [col.strip() for col in curated_bow.columns]

# Keep only the first occurrence of each duplicated column name
curated_bow = curated_bow.loc[:, ~curated_bow.columns.duplicated()].copy()
curated_bow

In [None]:
# Number of partitions the rows are divided into.
partitions_ = 1

# Rows per partition; floor division, so any remainder is not counted here.
bin_size = curated_bow.shape[0] // partitions_
print(bin_size)

In [None]:
def run_ATNODE(data=None, partition_size=None):
    """Score every cell of a frame against the mean/sd ratio of its partition.

    For each column, the values are ranked in ascending order and cut into
    consecutive partitions of ``partition_size`` ranked values. Every value
    ``x`` receives the score ``abs(x - mean / sd)``, where ``mean`` and ``sd``
    (population standard deviation) are computed over the partition that
    contains ``x``. A partition with zero spread uses a ratio of 0, and
    non-finite scores are replaced by ``np.nan_to_num``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame to score. Defaults to the notebook-level ``curated_bow``.
    partition_size : int, optional
        Number of ranked values per partition. Defaults to the notebook-level
        ``bin_size``. Rows left over after the last full partition are folded
        into that last partition; a size larger than the frame yields a single
        partition.

    Returns
    -------
    pandas.DataFrame
        One score per input cell, with the same index and column labels as
        ``data``, so row ``i`` of the result describes row ``i`` of the input.

    Raises
    ------
    ValueError
        If ``partition_size`` is smaller than 1 for a non-empty frame.
    """
    if data is None:
        data = curated_bow
    if partition_size is None:
        partition_size = bin_size

    n_rows, n_cols = data.shape
    if n_rows and partition_size < 1:
        raise ValueError(
            f"partition_size must be at least 1, got {partition_size}"
        )

    # Partition boundaries (in rank space) are identical for every column,
    # so they are computed once. The final partition absorbs leftover rows.
    n_bins = max(n_rows // partition_size, 1) if n_rows else 0
    bounds = [(b * partition_size, (b + 1) * partition_size) for b in range(n_bins)]
    if bounds:
        bounds[-1] = (bounds[-1][0], n_rows)

    # Pre-allocate the whole result instead of growing a frame column by column.
    scores = np.zeros((n_rows, n_cols), dtype=float)

    # NOTE(review): every column is scored, including any label column that is
    # part of the frame -- confirm that is intended.
    for j in range(n_cols):
        start_j = time.perf_counter()

        values = data.iloc[:, j].to_numpy()
        # Row positions in ascending value order; stable so ties are deterministic.
        order = np.argsort(values, kind="stable")

        for lo, hi in bounds:
            rows = order[lo:hi]
            segment = values[rows]

            partition_mean = segment.mean()
            partition_sd = segment.std()
            mean_over_sd = partition_mean / partition_sd if partition_sd else 0

            # Write each score back to the row it was computed from, so a
            # row-wise reduction over the result refers to a single record.
            scores[rows, j] = np.nan_to_num(np.abs(segment - mean_over_sd))

        time_df = format(time.perf_counter() - start_j, '.6f')
        print(f"Runtime of ATNODE for j ({j})\t:\t{time_df}")

    return pd.DataFrame(scores, index=data.index, columns=data.columns)

In [None]:
# Compute the per-attribute scores, then record the row-wise maximum across
# all score columns under 'max_noise'.
atnode = run_ATNODE()
atnode = atnode.assign(max_noise=atnode.max(axis=1))

In [None]:
# Attach the noise score (aligned on index) just before the final column, so
# the final column keeps its position.
curated_bow.insert(len(curated_bow.columns) - 1, 'max_noise', atnode['max_noise'])

# Order rows from highest to lowest noise score and renumber them 0..n-1.
# drop=True discards the old index directly; materialising it as an 'index'
# column and dropping that name afterwards would delete a real feature if the
# frame happens to contain a column called "index".
curated_bow = (
    curated_bow
    .sort_values(by=['max_noise'], ascending=False)
    .reset_index(drop=True)
)
curated_bow

## n Estimator = 100

In [None]:
# Results table for the "N-Estimator = 100" sweep; starts empty with one
# column for the removal percentage k and one per reported metric.
rslts_df_100 = pd.DataFrame(
    columns=[
        "k",
        "Accuracy",
        "Precision",
        "Recall",
        "F_Score",
    ]
)

In [None]:
# Sweep: discard the first k% of rows of curated_bow (which is ordered by
# descending max_noise) for k = 0, 5, ..., 50 and evaluate the classifier on
# what remains.
n_estimators = 100  # value passed to dataClassifier.dataAnalysis for this sweep

rslts_100 = []

precisions = []
recalls = []
f_scores = []

print(f"N-Estimator = {n_estimators} \n\n")

for k in range(0, 55, 5):
    # Number of leading (highest-scoring) rows to discard for this k.
    n = round((k / 100) * curated_bow.shape[0])

    print(f"Results after Removing top {k}% data")

    # Copy the slice so the classifier cannot modify curated_bow.
    an_cleaned = curated_bow.iloc[n:].copy()

    # The last column is the target. 'max_noise' is only the ranking key used
    # to choose which rows to discard; it is derived from the other columns,
    # so it is excluded from the features handed to the classifier.
    X = an_cleaned.iloc[:, :-1].drop(columns=['max_noise'], errors='ignore')
    y = an_cleaned.iloc[:, -1]

    attrNoiseObj = dataClassifier(X, y)
    attrNoiseObj.dataAnalysis(n_estimators)

    acc = round(attrNoiseObj.accuracy, 3)
    pre = round(attrNoiseObj.precision, 3)
    rec = round(attrNoiseObj.recall, 3)
    f_score = round(attrNoiseObj.f_score, 3)

    print(f"Accuracy: {acc}")
    print(f"Precision: {pre}")
    print(f"Recall: {rec}")
    print(f"F-Score: {f_score}")

    print("\n\n")

    precisions.append(pre)
    recalls.append(rec)
    f_scores.append(f_score)

    rslts_100.append({
        'k': k,
        'n': n,
        'accuracy': acc,
        'precision': pre,
        'recall': rec,
        'f_score': f_score,
    })

# Build the summary table once from the collected records. Re-running this
# cell therefore replaces the table instead of appending duplicate rows, and
# the columns get numeric dtypes.
rslts_df_100 = (
    pd.DataFrame(rslts_100)
    .drop(columns='n')
    .rename(columns={
        'accuracy': 'Accuracy',
        'precision': 'Precision',
        'recall': 'Recall',
        'f_score': 'F_Score',
    })
)

In [None]:
# Display the metrics table collected for the "N-Estimator = 100" sweep.
rslts_df_100

## n Estimator = 300

In [None]:
# Results table for the "N-Estimator = 300" sweep; starts empty with one
# column for the removal percentage k and one per reported metric.
rslts_df_300 = pd.DataFrame(
    columns=[
        "k",
        "Accuracy",
        "Precision",
        "Recall",
        "F_Score",
    ]
)

In [None]:
# Sweep: discard the first k% of rows of curated_bow (which is ordered by
# descending max_noise) for k = 0, 5, ..., 50 and evaluate the classifier on
# what remains.
n_estimators = 300  # value passed to dataClassifier.dataAnalysis for this sweep

rslts_300 = []

precisions = []
recalls = []
f_scores = []

print(f"N-Estimator = {n_estimators} \n\n")

for k in range(0, 55, 5):
    # Number of leading (highest-scoring) rows to discard for this k.
    n = round((k / 100) * curated_bow.shape[0])

    print(f"Results after Removing top {k}% data")

    # Copy the slice so the classifier cannot modify curated_bow.
    an_cleaned = curated_bow.iloc[n:].copy()

    # The last column is the target. 'max_noise' is only the ranking key used
    # to choose which rows to discard; it is derived from the other columns,
    # so it is excluded from the features handed to the classifier.
    X = an_cleaned.iloc[:, :-1].drop(columns=['max_noise'], errors='ignore')
    y = an_cleaned.iloc[:, -1]

    attrNoiseObj = dataClassifier(X, y)
    attrNoiseObj.dataAnalysis(n_estimators)

    acc = round(attrNoiseObj.accuracy, 3)
    pre = round(attrNoiseObj.precision, 3)
    rec = round(attrNoiseObj.recall, 3)
    f_score = round(attrNoiseObj.f_score, 3)

    print(f"Accuracy: {acc}")
    print(f"Precision: {pre}")
    print(f"Recall: {rec}")
    print(f"F-Score: {f_score}")

    print("\n\n")

    precisions.append(pre)
    recalls.append(rec)
    f_scores.append(f_score)

    rslts_300.append({
        'k': k,
        'n': n,
        'accuracy': acc,
        'precision': pre,
        'recall': rec,
        'f_score': f_score,
    })

# Build the summary table once from the collected records. Re-running this
# cell therefore replaces the table instead of appending duplicate rows, and
# the columns get numeric dtypes.
rslts_df_300 = (
    pd.DataFrame(rslts_300)
    .drop(columns='n')
    .rename(columns={
        'accuracy': 'Accuracy',
        'precision': 'Precision',
        'recall': 'Recall',
        'f_score': 'F_Score',
    })
)

In [None]:
# Display the metrics table collected for the "N-Estimator = 300" sweep.
rslts_df_300