In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# read the files
df_train_file = pd.read_csv('train.csv')
df_test_file = pd.read_csv('test.csv')
df_col_label = pd.read_csv('codebook.csv')

In [36]:
# show the info of the train file
# df_train_file.info()
# df_train_file.head()

In [37]:
# show the info of the test file
# df_test_file.info()
# df_test_file.head()

In [38]:
# convert the training data and test data into numpy arrays
test_data = df_test_file.to_numpy()

y = df_train_file['target'].values
full_data = df_train_file.drop('target', axis=1)

print(f'The proportion of middle class in the training set is {sum(y)/len(y)}')

The proportion of middle class in the training set is 0.2102622690244387


In [48]:
# split the X into continuous and discrete
discrete_cols = []
binary_cols = []
variable_names = df_col_label['Variable Name'].values
variable_type = df_col_label['Type'].values

mapping = {}
states = []
for index, name in enumerate(variable_names):
    mapping[name] = variable_type[index]
    
for index, name in enumerate(list(full_data.columns)):
    if not name.startswith('state') and mapping[name] != 'continuous':
        discrete_cols.append(name)
        
for i in list(full_data.columns):
    if i.startswith('state'):
        discrete_cols.append(i)
        binary_cols.append(i)

df_discrete_test_data = df_test_file[discrete_cols].copy()
df_discrete_cols = full_data[discrete_cols].copy()
df_continuous_cols = full_data.drop(discrete_cols, axis=1)
df_binary_cols = full_data[binary_cols].copy()

In [40]:
# this is processing for the standardization (no need for binary)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
continuous_cols_st = scaler.fit_transform(df_continuous_cols)

In [41]:
# this is PCA
from sklearn.decomposition import PCA
continuous_cols_st_copy = continuous_cols_st.copy()
pca = PCA(n_components=4)
continuous_cols_st_pca = pca.fit_transform(continuous_cols_st_copy)

In [42]:
# choose the training data here
from sklearn.preprocessing import PolynomialFeatures
all_discrete = df_discrete_cols.to_numpy()
all_binary = df_binary_cols.to_numpy()
all_continuous = df_continuous_cols.to_numpy()
all_pca = pd.concat([pd.DataFrame(continuous_cols_st_pca), df_discrete_cols], axis=1).to_numpy()
all_binary_pca = pd.concat([pd.DataFrame(continuous_cols_st_pca), df_binary_cols], axis=1).to_numpy()

X = all_discrete

print((all_discrete.shape[1]))
print(all_binary.shape[1])
print((all_continuous.shape[1]))
print((all_continuous.shape[0]))

117
40
42
20132


In [51]:
# set seed
seed = 42

In [44]:
# split the training data into training and validation
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2, random_state=seed)

In [45]:
# import the accuracy_score
from sklearn.metrics import accuracy_score

# simple dummy classifier
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent', random_state=seed)
dummy_clf.fit(X_train, y_train)
print(dummy_clf.score(X_validation, y_validation))
print(f'The percentage of people from middle is {np.average(y)}')

0.783213310156444
The percentage of people from middle is 0.2102622690244387


In [46]:
from sklearn.linear_model import SGDClassifier
# from sklearn.svm import SVC too much time (quadratic, better to use linear SVC or SGD)
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier # shit: too much time to run and worse results
from sklearn.ensemble import GradientBoostingClassifier # shit: too much time to run and worse results
from xgboost import XGBClassifier # shit: too much time to run and worse results
# from sklearn.naive_bayes import GaussianNB
# from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB # not a good one
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis # shit
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis # shit
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# logistic regression
X = all_discrete

parameters = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'optimal', 'adaptive', 'invscaling'],
    'eta0': [0.001, 0.01, 0.1],
}

model = SGDClassifier(
    loss='log_loss',
    class_weight='balanced',
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")


In [None]:
# linear SVC
X = all_discrete

parameters = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': [0.001, 0.01, 0.1, 1],
}

model = LinearSVC(
    class_weight='balanced',
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")


In [None]:
# KNN
X = all_discrete

parameters = {
    'n_neighbors': [3, 4, 5],
    'weights': ['uniform', 'distance'],
}

model = KNeighborsClassifier(
    
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")


In [52]:
X = all_discrete

model = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    n_estimators=300,
    class_weight='balanced',
    random_state=seed,
)

model.fit(X, y)
prediction = model.predict(df_discrete_test_data.to_numpy())



In [53]:
print(len(prediction))
print(sum(prediction)/len(prediction))
print(sum(x for x in prediction if x == 1))

5034
0.17401668653158522
876


In [None]:
# random forest
X = all_discrete

parameters = {
}

model = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    n_estimators=300,
    class_weight='balanced',
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=10, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
model = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    n_estimators=300,
    class_weight='balanced',
    random_state=seed,
)
model.fit(X, y)
model.score(X, y)

In [None]:
# extra tree
X = all_discrete

parameters = {
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'bootstrap': [True, False],
    'oob_score': [True, False],
}

model = ExtraTreesClassifier(
    class_weight='balanced',
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
# gradient boosting
X = all_discrete

parameters = {
    'loss': ['log_loss', 'exponential'],
    'learning_rate': [0.001, 0.01, 0.1],
    'n_estimators': [300],
    'criterion': ['friedman_mse', 'squared_error'],
    'max_features': ['sqrt', 'log2', None]
}

model = GradientBoostingClassifier(
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
# xgb
X = all_discrete

parameters = {
    'booster': ['gbtree', 'dart'],
    'eta': [0.1, 0.3, 0.5],
    'objective': ['binary:logistic', 'binary:logitraw'],
}

model = XGBClassifier(
    seed=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
# Binomial
X = all_discrete

parameters = {

}

model = BernoulliNB(
    
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
# linear discriminant
X = all_discrete

parameters = {
    'solver': ['svd', 'lsqr', 'eigen'],
    'shrinkage': ['auto', None],
}

model = LinearDiscriminantAnalysis(
    
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
# quadratic discriminant
X = all_discrete

parameters = {

}

model = QuadraticDiscriminantAnalysis(
    
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
# deep learning
X = all_discrete

# parameters = {
#     'activation': ['logistic', 'relu'],
#     'solver': ['lbfgs', 'adam', 'sgd'],
#     'batch_size': [32, 64, 128],
#     'learning_rate': ['adaptive'],
#     'learning_rate_init': [0.0001, 0.001, 0.01],
#     'max_iter': [200],
#     'hidden_layer_sizes': [(64, 32)],
# }

parameters = {
    'activation': ['relu'],
    'solver': ['adam'],
    'batch_size': [32],
    'learning_rate': ['adaptive'],
    'learning_rate_init': [0.001],
    'max_iter': [200],
    'hidden_layer_sizes': [(64, 32)],
}


model = MLPClassifier(
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

In [None]:
model = MLPClassifier(
    activation='relu',
    solver='adam',
    batch_size=32,
    learning_rate='adaptive',
    learning_rate_init=0.001,
    max_iter=1000,
    hidden_layer_sizes=(64)
)

model.fit(X_train,y_train)
model.score(X_validation, y_validation)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import interact
import ipywidgets as widgets

# Define the function to plot the sine wave
def plot_sine_wave(frequency):
    x = np.linspace(0, 2 * np.pi, 1000)  # X-axis range
    y = np.sin(frequency * x)            # Sine wave with variable frequency
    
    plt.figure(figsize=(8, 4))
    plt.plot(x, y, label=f'Sine wave with frequency {frequency} Hz')
    plt.xlabel('X')
    plt.ylabel('sin(X)')
    plt.title('Sine Wave with Adjustable Frequency')
    plt.ylim(-1, 1)
    plt.legend()
    plt.show()

# Use ipywidgets' interact function to create a slider
interact(plot_sine_wave, frequency=widgets.FloatSlider(value=1, min=0.1, max=10.0, step=0.1));


interactive(children=(FloatSlider(value=1.0, description='frequency', max=10.0, min=0.1), Output()), _dom_clas…