In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json, os

In [2]:
def make_df(path_to_file, inp_size, out_size,num_data=500, examples=2):
  #inp_size, out_size, examples = 15, 15, 4      # Modify
  cols = (inp_size**2 + out_size**2) * examples
  df = pd.DataFrame(columns=list(range(cols))+['operation','output'])
  op_list = [f'Dilation SE{i}' for i in range(1,9)] + [f'Erosion SE{i}' for i in range(1,9)]
  #path_to_file = "/content/IPARC_ChallengeV2/Dataset/CatA_Simple"   # Modify

  for id in range(num_data):
    with open(f'{path_to_file}/Task{id:03}.json') as f:
      data = json.load(f)
    with open(f'{path_to_file}/Task{id:03}_soln.txt') as file:
      lines = [line.rstrip() for line in file]
    d = []
    for i in range(len(data)):
      d += [element for innerList in data[i]['input'] for element in innerList] + [element for innerList in data[i]['output'] for element in innerList]
    for op in op_list:
      df.loc[len(df)] = (d + [op, op in lines])
  
  return df

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

def make_X_y(df):
    
    y = df.pop('output')
    X = df
    X = pd.get_dummies(df, columns = ['operation'])
    X.columns = X.columns.astype(str)
    
    print(f"Percentage of positive labels in out dataset: {100*sum(y == 1)/len(y)}%")
    return X, y

def training(X, y, test_size=0.3):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    perform_dict = dict()
    model_list = [('logistic_regression', LogisticRegression(random_state=0)),
                  ('decision_tree', DecisionTreeClassifier(random_state=0)),
                  ('support_vector_classifier', SVC(gamma='auto', random_state=0)),
                  ('random_forest', RandomForestClassifier(max_depth=2, random_state=0)),
                  ('gradient_boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0))
                 ]
    
    for name, model in model_list:
        print(name)
        clf = model.fit(X_train, y_train)
        train_acc, test_acc = clf.score(X_train, y_train), clf.score(X_test, y_test)
        y_test_predictions = clf.predict(X_test)
        precision = precision_score(y_test, y_test_predictions)
        recall = recall_score(y_test, y_test_predictions)
        f1score = f1_score(y_test, y_test_predictions)
        
        perform_dict[f'{name}_train_accuracy'], perform_dict[f'{name}_test_accuracy'] = train_acc, test_acc
        perform_dict[f'{name}_precision'] = precision
        perform_dict[f'{name}_recall'] = recall
        perform_dict[f'{name}_f1'] = f1score
        
        print(f'    train accuracy = {clf.score(X_train, y_train)}')
        print(f'    test accuracy = {clf.score(X_test, y_test)}')
        print(f"    % '1's predicted: {100*sum(y_test_predictions == 1)/len(y_test_predictions)}%")
        print(f"    Precision = {precision}")
        print(f"    Recall = {recall}")
        print(f"    F1 Score = {f1score}\n\n")
    
    return perform_dict
    

In [6]:
for seq in range(2, 11):
    df = make_df(f'../Dataset/Sequence_{seq}/CatA_Simple', 15, 15, num_data=500)
    print(f"For Datasets generated with {seq} number of operations:")
    
    X, y = make_X_y(df)
    training(X, y, test_size=0.3)

For Datasets generated with 2 number of operations:
Percentage of positive labels in out dataset: 23.075%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.7714285714285715
    test accuracy = 0.76125
    % '1's predicted: 0.3333333333333333%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


decision_tree
    train accuracy = 1.0
    test accuracy = 0.6541666666666667
    % '1's predicted: 23.291666666666668%
    Precision = 0.2629695885509839
    Recall = 0.26017699115044246
    F1 Score = 0.26156583629893243


support_vector_classifier


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.77125
    test accuracy = 0.7645833333333333
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


random_forest


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.77125
    test accuracy = 0.7645833333333333
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.77125
    test accuracy = 0.7645833333333333
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


For Datasets generated with 3 number of operations:
Percentage of positive labels in out dataset: 32.925%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.68375
    test accuracy = 0.6125
    % '1's predicted: 5.083333333333333%
    Precision = 0.13114754098360656
    Recall = 0.01904761904761905
    F1 Score = 0.033264033264033266


decision_tree
    train accuracy = 1.0
    test accuracy = 0.58875
    % '1's predicted: 31.375%
    Precision = 0.40239043824701193
    Recall = 0.3607142857142857
    F1 Score = 0.3804143126177025


support_vector_classifier


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.6796428571428571
    test accuracy = 0.65
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


random_forest


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.6796428571428571
    test accuracy = 0.65
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting
    train accuracy = 0.6801785714285714
    test accuracy = 0.64875
    % '1's predicted: 0.2916666666666667%
    Precision = 0.2857142857142857
    Recall = 0.002380952380952381
    F1 Score = 0.004722550177095632


For Datasets generated with 4 number of operations:
Percentage of positive labels in out dataset: 41.9%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.6132142857142857
    test accuracy = 0.4870833333333333
    % '1's predicted: 27.791666666666668%
    Precision = 0.30884557721139433
    Recall = 0.21106557377049182
    F1 Score = 0.2507608034083993


decision_tree
    train accuracy = 1.0
    test accuracy = 0.605
    % '1's predicted: 40.916666666666664%
    Precision = 0.5142566191446029
    Recall = 0.5174180327868853
    F1 Score = 0.515832482124617


support_vector_classifier


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.5757142857142857
    test accuracy = 0.5933333333333334
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


random_forest


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.5757142857142857
    test accuracy = 0.5933333333333334
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting
    train accuracy = 0.58875
    test accuracy = 0.5483333333333333
    % '1's predicted: 15.5%
    Precision = 0.3548387096774194
    Recall = 0.13524590163934427
    F1 Score = 0.19584569732937687


For Datasets generated with 5 number of operations:
Percentage of positive labels in out dataset: 48.5%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.5985714285714285
    test accuracy = 0.4583333333333333
    % '1's predicted: 43.708333333333336%
    Precision = 0.449952335557674
    Recall = 0.39497907949790795
    F1 Score = 0.42067736185383237


decision_tree
    train accuracy = 1.0
    test accuracy = 0.5716666666666667
    % '1's predicted: 48.791666666666664%
    Precision = 0.5713065755764304
    Recall = 0.5598326359832636
    F1 Score = 0.5655114116652579


support_vector_classifier
    train accuracy = 0.5405357142857142
    test accuracy = 0.4975
    % '1's predicted: 9.041666666666666%
    Precision = 0.47465437788018433
    Recall = 0.08619246861924686
    F1 Score = 0.14589235127478753


random_forest
    train accuracy = 0.5210714285714285
    test accuracy = 0.5008333333333334
    % '1's predicted: 0.125%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting
    train accuracy = 0.575
    test accuracy = 0.4766666666666667
    % '1's predicted: 42.791666666666664%
    Pr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.6178571428571429
    test accuracy = 0.49
    % '1's predicted: 64.125%
    Precision = 0.5367121507472384
    Recall = 0.6178010471204188
    F1 Score = 0.5744089012517385


decision_tree
    train accuracy = 1.0
    test accuracy = 0.6633333333333333
    % '1's predicted: 53.541666666666664%
    Precision = 0.7058365758754864
    Recall = 0.6783844427823486
    F1 Score = 0.6918382913806256


support_vector_classifier
    train accuracy = 0.5503571428571429
    test accuracy = 0.5583333333333333
    % '1's predicted: 99.79166666666667%
    Precision = 0.5578288100208768
    Recall = 0.9992520568436799
    F1 Score = 0.7159699892818864


random_forest
    train accuracy = 0.54875
    test accuracy = 0.5570833333333334
    % '1's predicted: 100.0%
    Precision = 0.5570833333333334
    Recall = 1.0
    F1 Score = 0.7155472303987156


gradient_boosting
    train accuracy = 0.5930357142857143
    test accuracy = 0.5275
    % '1's predicted: 72.875%
    Precision = 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.6376785714285714
    test accuracy = 0.5579166666666666
    % '1's predicted: 75.29166666666667%
    Precision = 0.6225788599889319
    Recall = 0.7480053191489362
    F1 Score = 0.6795530051344005


decision_tree
    train accuracy = 1.0
    test accuracy = 0.6333333333333333
    % '1's predicted: 60.75%
    Precision = 0.7139917695473251
    Recall = 0.692154255319149
    F1 Score = 0.7029034436191763


support_vector_classifier
    train accuracy = 0.6028571428571429
    test accuracy = 0.6266666666666667
    % '1's predicted: 100.0%
    Precision = 0.6266666666666667
    Recall = 1.0
    F1 Score = 0.7704918032786886


random_forest
    train accuracy = 0.6028571428571429
    test accuracy = 0.6266666666666667
    % '1's predicted: 100.0%
    Precision = 0.6266666666666667
    Recall = 1.0
    F1 Score = 0.7704918032786886


gradient_boosting
    train accuracy = 0.6192857142857143
    test accuracy = 0.59625
    % '1's predicted: 84.875%
    Precision = 0.63

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.6978571428571428
    test accuracy = 0.6508333333333334
    % '1's predicted: 88.04166666666667%
    Precision = 0.6862281116895409
    Recall = 0.8923076923076924
    F1 Score = 0.7758159443552701


decision_tree
    train accuracy = 1.0
    test accuracy = 0.7008333333333333
    % '1's predicted: 70.375%
    Precision = 0.76850207223209
    Recall = 0.7987692307692308
    F1 Score = 0.7833433916716959


support_vector_classifier
    train accuracy = 0.6730357142857143
    test accuracy = 0.6770833333333334
    % '1's predicted: 100.0%
    Precision = 0.6770833333333334
    Recall = 1.0
    F1 Score = 0.8074534161490683


random_forest
    train accuracy = 0.6730357142857143
    test accuracy = 0.6770833333333334
    % '1's predicted: 100.0%
    Precision = 0.6770833333333334
    Recall = 1.0
    F1 Score = 0.8074534161490683


gradient_boosting
    train accuracy = 0.6778571428571428
    test accuracy = 0.66375
    % '1's predicted: 92.58333333333333%
    Preci

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.7558928571428571
    test accuracy = 0.6929166666666666
    % '1's predicted: 79.16666666666667%
    Precision = 0.7547368421052632
    Recall = 0.8410557184750733
    F1 Score = 0.7955617198335645


decision_tree
    train accuracy = 1.0
    test accuracy = 0.74625
    % '1's predicted: 71.16666666666667%
    Precision = 0.8208430913348946
    Recall = 0.8222873900293255
    F1 Score = 0.8215646059185467


support_vector_classifier
    train accuracy = 0.7105357142857143
    test accuracy = 0.7104166666666667
    % '1's predicted: 100.0%
    Precision = 0.7104166666666667
    Recall = 1.0
    F1 Score = 0.830694275274056


random_forest
    train accuracy = 0.7105357142857143
    test accuracy = 0.7104166666666667
    % '1's predicted: 100.0%
    Precision = 0.7104166666666667
    Recall = 1.0
    F1 Score = 0.830694275274056


gradient_boosting
    train accuracy = 0.7371428571428571
    test accuracy = 0.6941666666666667
    % '1's predicted: 85.29166666666667

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.7994642857142857
    test accuracy = 0.72875
    % '1's predicted: 71.75%
    Precision = 0.8042973286875726
    Recall = 0.8151854031783402
    F1 Score = 0.8097047646886875


decision_tree
    train accuracy = 1.0
    test accuracy = 0.7575
    % '1's predicted: 70.45833333333333%
    Precision = 0.8302779420461266
    Recall = 0.8263684520306063
    F1 Score = 0.8283185840707965


support_vector_classifier
    train accuracy = 0.7148214285714286
    test accuracy = 0.7079166666666666
    % '1's predicted: 100.0%
    Precision = 0.7079166666666666
    Recall = 1.0
    F1 Score = 0.8289826787021224


random_forest
    train accuracy = 0.7148214285714286
    test accuracy = 0.7079166666666666
    % '1's predicted: 100.0%
    Precision = 0.7079166666666666
    Recall = 1.0
    F1 Score = 0.8289826787021224


gradient_boosting
    train accuracy = 0.7769642857142857
    test accuracy = 0.7341666666666666
    % '1's predicted: 75.125%
    Precision = 0.7942318358291

In [None]:
data_size = dict()
for size in range(9,24,2):
    print(f"For Datasets generated with Input and Output grid of size {size}:\n")
    df = make_df(f'../Dataset/size_{size:02}', size, size, num_data=500)
    X, y = make_X_y(df)
    data_size[size] = training(X, y, test_size=0.3)

For Datasets generated with Input and Output grid of size 9:

Percentage of positive labels in out dataset: 43.55%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.6066071428571429
    test accuracy = 0.5416666666666666
    % '1's predicted: 28.916666666666668%
    Precision = 0.4668587896253602
    Recall = 0.30740037950664134
    F1 Score = 0.3707093821510297


decision_tree
    train accuracy = 1.0
    test accuracy = 0.6808333333333333
    % '1's predicted: 43.833333333333336%
    Precision = 0.6368821292775665
    Recall = 0.635673624288425
    F1 Score = 0.6362773029439696


support_vector_classifier


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.5667857142857143
    test accuracy = 0.5608333333333333
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


random_forest


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.5660714285714286
    test accuracy = 0.5608333333333333
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting
    train accuracy = 0.6044642857142857
    test accuracy = 0.5670833333333334
    % '1's predicted: 24.041666666666668%
    Precision = 0.512998266897747
    Recall = 0.2808349146110057
    F1 Score = 0.3629675045984058


For Datasets generated with Input and Output grid of size 11:

Percentage of positive labels in out dataset: 47.1%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.5867857142857142
    test accuracy = 0.47375
    % '1's predicted: 37.041666666666664%
    Precision = 0.4341957255343082
    Recall = 0.3368237347294939
    F1 Score = 0.37936117936117936


decision_tree
    train accuracy = 1.0
    test accuracy = 0.64375
    % '1's predicted: 46.541666666666664%
    Precision = 0.630259623992838
    Recall = 0.6143106457242583
    F1 Score = 0.6221829429960231


support_vector_classifier
    train accuracy = 0.5466071428571428
    test accuracy = 0.5170833333333333
    % '1's predicted: 3.9583333333333335%
    Precision = 0.43157894736842106
    Recall = 0.03577661431064572
    F1 Score = 0.06607574536663981


random_forest


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.5317857142857143
    test accuracy = 0.5225
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting
    train accuracy = 0.57875
    test accuracy = 0.51
    % '1's predicted: 31.083333333333332%
    Precision = 0.47989276139410186
    Recall = 0.31239092495637
    F1 Score = 0.3784355179704017


For Datasets generated with Input and Output grid of size 13:

Percentage of positive labels in out dataset: 47.475%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.5983928571428572
    test accuracy = 0.45916666666666667
    % '1's predicted: 45.916666666666664%
    Precision = 0.41560798548094374
    Recall = 0.4118705035971223
    F1 Score = 0.4137308039747064


decision_tree
    train accuracy = 1.0
    test accuracy = 0.57125
    % '1's predicted: 47.291666666666664%
    Precision = 0.5365638766519824
    Recall = 0.5476618705035972
    F1 Score = 0.5420560747663551


support_vector_classifier
    train accuracy = 0.5541071428571429
    test accuracy = 0.5225
    % '1's predicted: 18.0%
    Precision = 0.46064814814814814
    Recall = 0.1789568345323741
    F1 Score = 0.25777202072538863


random_forest
    train accuracy = 0.5207142857142857
    test accuracy = 0.5370833333333334
    % '1's predicted: 0.041666666666666664%
    Precision = 1.0
    Recall = 0.0008992805755395684
    F1 Score = 0.0017969451931716081


gradient_boosting
    train accuracy = 0.5757142857142857
    test accuracy = 0.4920833333333333
    % '1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.6010714285714286
    test accuracy = 0.4683333333333333
    % '1's predicted: 46.666666666666664%
    Precision = 0.44375
    Recall = 0.43217391304347824
    F1 Score = 0.4378854625550661


decision_tree
    train accuracy = 1.0
    test accuracy = 0.595
    % '1's predicted: 48.583333333333336%
    Precision = 0.5763293310463122
    Recall = 0.5843478260869566
    F1 Score = 0.5803108808290156


support_vector_classifier
    train accuracy = 0.5517857142857143
    test accuracy = 0.51375
    % '1's predicted: 14.375%
    Precision = 0.4753623188405797
    Recall = 0.1426086956521739
    F1 Score = 0.21939799331103677


random_forest
    train accuracy = 0.5207142857142857
    test accuracy = 0.5220833333333333
    % '1's predicted: 0.7083333333333334%
    Precision = 0.5882352941176471
    Recall = 0.008695652173913044
    F1 Score = 0.017137960582690664


gradient_boosting
    train accuracy = 0.58
    test accuracy = 0.5054166666666666
    % '1's predicted: 4

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.6035714285714285
    test accuracy = 0.45916666666666667
    % '1's predicted: 42.125%
    Precision = 0.410484668644906
    Recall = 0.37153088630259623
    F1 Score = 0.39003759398496235


decision_tree
    train accuracy = 1.0
    test accuracy = 0.5279166666666667
    % '1's predicted: 47.833333333333336%
    Precision = 0.4930313588850174
    Recall = 0.5067144136078783
    F1 Score = 0.4997792494481236


support_vector_classifier
    train accuracy = 0.5323214285714286
    test accuracy = 0.5270833333333333
    % '1's predicted: 2.5833333333333335%
    Precision = 0.3548387096774194
    Recall = 0.019695613249776187
    F1 Score = 0.037319762510602206


random_forest


  _warn_prf(average, modifier, msg_start, len(result))


    train accuracy = 0.5226785714285714
    test accuracy = 0.5345833333333333
    % '1's predicted: 0.0%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting
    train accuracy = 0.58125
    test accuracy = 0.4979166666666667
    % '1's predicted: 40.75%
    Precision = 0.4550102249488753
    Recall = 0.39838854073410923
    F1 Score = 0.4248210023866349


For Datasets generated with Input and Output grid of size 19:

Percentage of positive labels in out dataset: 47.975%
logistic_regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


    train accuracy = 0.60375
    test accuracy = 0.4583333333333333
    % '1's predicted: 43.875%
    Precision = 0.42924976258309594
    Recall = 0.39270199826238056
    F1 Score = 0.41016333938294014


decision_tree
    train accuracy = 1.0
    test accuracy = 0.5325
    % '1's predicted: 46.791666666666664%
    Precision = 0.5129118432769367
    Recall = 0.5004344048653345
    F1 Score = 0.5065963060686015


support_vector_classifier
    train accuracy = 0.5401785714285714
    test accuracy = 0.5025
    % '1's predicted: 7.208333333333333%
    Precision = 0.37572254335260113
    Recall = 0.05647263249348393
    F1 Score = 0.09818731117824774


random_forest
    train accuracy = 0.5203571428571429
    test accuracy = 0.52
    % '1's predicted: 0.041666666666666664%
    Precision = 0.0
    Recall = 0.0
    F1 Score = 0.0


gradient_boosting
    train accuracy = 0.5816071428571429
    test accuracy = 0.47541666666666665
    % '1's predicted: 44.333333333333336%
    Precision = 0.449248

In [None]:
import matplotlib.pyplot as plt
model_list = ['logistic_regression', 'decision_tree', 'support_vector_classifier', 'random_forest', 'gradient_boosting']
                 
x = list(range(2,11))

for model in model_list:
    y = [data_dict[size][f'{name}_test_accuracy'] for size in range(2, 11)]
    plt.plot(x,y, label=model)
plt.xlabel('No. of sequence')
plt.ylabel('Test accuracy')
plt.title('Test accuracies')
plt.show()

In [None]:
df = pd.DataFrame()
x = list(range(2,11))
df['size'] = x
for model in model_list:
    y = [data_dict[size][f'{name}_test_accuracy'] for size in range(2, 11)]
    df[model]  = y
df.to_csv("test_acc.csv")