In [1]:
from sklearn import preprocessing

In [2]:
from tensorflow.keras.layers import GRU

In [3]:
from tensorflow.keras.models import Sequential

In [4]:
from tensorflow.keras.layers import Dense, Dropout, Activation

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("adult_50k.csv", sep=",")
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,capital,income
0,39,State-gov,77516,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,2174,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,0,<=50K
2,38,Private,215646,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,0,<=50K
3,53,Private,234721,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,0,<=50K
4,28,Private,338409,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,0,<=50K
5,37,Private,284582,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,United-States,0,<=50K
6,49,Private,160187,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,16,Jamaica,0,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,0,>50K
8,31,Private,45781,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,50,United-States,14084,>50K
9,42,Private,159449,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,5178,>50K


In [7]:
from enum import Enum

class ContinuousCategorical(Enum):
    CONTINUOUS = 1
    CATEGORICAL = 2
class ClassifiedSeries(object):

    def __init__(self, series: pd.core.series.Series):
        self.series = series
        self.unique_values_amount = self.series.unique().size
        self.is_unique = self.series.size == self.unique_values_amount

    @property
    def continuous_or_categorical(self) -> ContinuousCategorical:
        if self.series.dtype == 'int64':
            if self.is_unique or self.unique_values_amount <= 10:
                return ContinuousCategorical.CATEGORICAL
            return ContinuousCategorical.CONTINUOUS
        return ContinuousCategorical.CATEGORICAL
    
    @property
    def values_range_representation(self) -> str:
        if self.continuous_or_categorical == ContinuousCategorical.CATEGORICAL:
            return ', '.join(self.series.unique())
        return f'{self.series.min()} to {self.series.max()}'
    
    def part1_repr(self):
        return f'{self.series.name}  - {self.continuous_or_categorical.name.lower()} - {self.unique_values_amount} values - {self.values_range_representation}'
    
    @property
    def minimal_occurence_for_balance(self):
        if self.continuous_or_categorical == ContinuousCategorical.CONTINUOUS:
            return 0
        return 100 / self.unique_values_amount / 10
        
    @property
    def is_balanced(self):
        if self.continuous_or_categorical == ContinuousCategorical.CATEGORICAL:
            return min(self.distribution.values()) > self.minimal_occurence_for_balance
        return True
    
    @property
    def distribution(self):
        return self.series.value_counts(normalize=True).mul(100).round(1).to_dict()
        
    def part2_repr(self):
        result = f"{self.series.name} - {self.continuous_or_categorical.name.lower()} - "
        if self.is_balanced:
            result += "balanced"
        else:
            result += "imbalanced"
        if self.continuous_or_categorical == ContinuousCategorical.CATEGORICAL:
            result += f" - {self.minimal_occurence_for_balance}% min occurence - {', '.join(f'{k} {v}%' for k, v in self.distribution.items())}"
        return result
    
    def part3_repr(self):
        result = f"{self.series.name} - {self.continuous_or_categorical.name.lower()} - "
        if self.is_balanced:
            result += "balanced"
        else:
            result += "imbalanced"
        return result
        
        


In [8]:
for column in df:
    print(ClassifiedSeries(getattr(df, column)).part1_repr())

age  - continuous - 74 values - 17 to 90
workclass  - categorical - 7 values - State-gov, Self-emp-not-inc, Private, Federal-gov, Local-gov, ?, Self-emp-inc
fnlwgt  - continuous - 28523 values - 12285 to 1490400
education  - categorical - 16 values - Bachelors, HS-grad, 11th, Masters, 9th, Some-college, Assoc-acdm, Assoc-voc, 7th-8th, Doctorate, Prof-school, 5th-6th, 10th, 1st-4th, Preschool, 12th
marital-status  - categorical - 7 values - Never-married, Married-civ-spouse, Divorced, Married-spouse-absent, Separated, Married-AF-spouse, Widowed
occupation  - categorical - 15 values - Adm-clerical, Exec-managerial, Handlers-cleaners, Prof-specialty, Other-service, Sales, Craft-repair, Transport-moving, Farming-fishing, Machine-op-inspct, Tech-support, ?, Protective-serv, Armed-Forces, Priv-house-serv
relationship  - categorical - 6 values - Not-in-family, Husband, Wife, Own-child, Unmarried, Other-relative
race  - categorical - 5 values - White, Black, Asian-Pac-Islander, Amer-Indian-Esk

In [9]:
for column in df:
    print(ClassifiedSeries(getattr(df, column)).part2_repr())

age - continuous - balanced
workclass - categorical - balanced - 1.4285714285714286% min occurence - Private 69.4%, Self-emp-not-inc 7.9%, Local-gov 6.4%, ? 5.8%, State-gov 4.1%, Self-emp-inc 3.5%, Federal-gov 2.9%
fnlwgt - continuous - balanced
education - categorical - imbalanced - 0.625% min occurence - HS-grad 32.3%, Some-college 22.3%, Bachelors 16.4%, Masters 5.4%, Assoc-voc 4.2%, 11th 3.7%, Assoc-acdm 3.3%, 10th 2.8%, 7th-8th 1.9%, Prof-school 1.7%, 9th 1.5%, 12th 1.3%, Doctorate 1.2%, 5th-6th 1.0%, 1st-4th 0.5%, Preschool 0.2%
marital-status - categorical - imbalanced - 1.4285714285714286% min occurence - Married-civ-spouse 45.8%, Never-married 33.0%, Divorced 13.6%, Separated 3.1%, Widowed 3.1%, Married-spouse-absent 1.3%, Married-AF-spouse 0.1%
occupation - categorical - imbalanced - 0.6666666666666667% min occurence - Prof-specialty 12.6%, Craft-repair 12.5%, Exec-managerial 12.5%, Adm-clerical 11.4%, Sales 11.3%, Other-service 10.1%, Machine-op-inspct 6.2%, ? 5.8%, Transpor

In [10]:
added_lines_count = 0
while True:
    need_one_more_entry = False
    for column in df:
        cs = ClassifiedSeries(getattr(df, column))
        if column != 'native-country' and not cs.is_balanced:
            need_one_more_entry = True
    if need_one_more_entry:
        added_lines_count += 1
        new_line = dict()
        for column in df:
            reversed_distribution_dict = {v: k for k, v in ClassifiedSeries(getattr(df, column)).distribution.items()}
            new_line[column] = reversed_distribution_dict[min(reversed_distribution_dict.keys())]
        print(new_line)
        df.append(new_line, ignore_index=True)
    else:
        break
print(f'{added_lines_count} rows added')

SyntaxError: invalid syntax (2276512255.py, line 6)

In [None]:
for column in df:
    print(ClassifiedSeries(getattr(df, column)).part3_repr())