In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PolynomialFeatures
import category_encoders as ce
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, BatchNormalization
from tensorflow.keras.models import Model
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.optimizers import RMSprop
from keras.callbacks import LearningRateScheduler

2024-02-18 16:37:25.881788: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
signdata = pd.read_csv('/Users/emilkoch/Library/Mobile Documents/com~apple~CloudDocs/Data Files/signdata.csv', encoding='latin-1')

In [68]:
# Separate target variable from features
X = signdata.drop(columns=['SignBankEnglishTranslations'])  # Features

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: ['List', 'Item', 'EnglishWF(lg10)', 'SignFrequency(M)', 'SignFrequency(SD)', 'SignFrequency(Z)', 'SignFrequency(N)', 'Unknown', 'SignFrequency(M-Native)', 'SignFrequency(SD-Native)', 'SignFreq(Z-native)', 'SignFrequency(N-Native)', 'Unknown(Native)', 'SignFrequency(M-Nonnative)', 'SignFrequency(SD-Nonnative)', 'SignFrequency(N-Nonnative)', 'SignFreq(Z-Nonnative)', 'Unknown(Nonnative)', 'DominantTranslationAgreement', 'DominantTranslationAgreement(Native)', 'DominantTranslationAgreement(Nonnative)', 'Iconicity(M)', 'Iconicity(SD)', 'Iconicity(Z)', 'Iconicity(N)', 'D.Iconicity(M)', 'D.Iconicity(SD)', 'D.Iconicity(N)', 'D.Iconicity(Z)', 'D.Iconicity(M-native)', 'D.Iconicity(SD-native)', 'D.Iconicity(Z-native)', 'D.Iconicity(N-native)', 'GuessConsistency', 'GuessAccuracy', 'Transparency(M)', 'Transparency SD', 'Transparency Z', 'Initialized.2.0', 'FingerspelledLoanSign.2.0', 'Compound.2.0', 'NumberOfMorphemes.2.0', 'SignOnset(ms)', 'SignOffset(ms)', 'SignDuration(ms)', '

In [69]:
# Preprocessing for numerical features
numerical_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# Copy numerical columns
X_numerical = X[numerical_cols].copy()
print(len(X_numerical))
print(len(numerical_cols))
print(X_numerical.head())
print(X_numerical.isnull().sum())

1984
129
   List  Item  EnglishWF(lg10)  SignFrequency(M)  SignFrequency(SD)  \
0     1     2            3.521             5.143              2.081   
1     1     3            4.645             6.032              1.516   
2     1     4            2.600             4.429              1.720   
3     1     5            2.928             2.621              1.720   
4     1     8            3.041             1.579              0.838   

   SignFrequency(Z)  SignFrequency(N)  Unknown  SignFrequency(M-Native)  \
0             0.621                21    0.000                    5.167   
1             1.068                31    0.000                    6.111   
2             0.232                21    0.000                    4.167   
3            -0.753                29    0.065                    2.000   
4            -1.198                19    0.095                    1.455   

   SignFrequency(SD-Native)  ...  ThumbContact.2.0Frequency  \
0                     2.167  ...                  

In [70]:
# Impute missing values and scaling
imputer = SimpleImputer(strategy='median')
X_numerical_imputed = imputer.fit_transform(X_numerical) 
scaler = StandardScaler()
X_numerical_scaled  = scaler.fit_transform(X_numerical_imputed)  

 'SpreadChangeM5.2.0' 'SignTypeM5.2.0' 'MovementM5.2.0'
 'RepeatedMovementM5.2.0' 'MajorLocationM5.2.0' 'MinorLocationM5.2.0'
 'SecondMinorLocationM5.2.0' 'ContactM5.2.0' 'NonDominantHandshapeM5.2.0'
 'UlnarRotationM5.2.0' 'MarkedHandshapeM6.2.0' 'FlexionChangeM6.2.0'
 'SpreadM6.2.0' 'SpreadChangeM6.2.0' 'ThumbContactM6.2.0' 'SignTypeM6.2.0'
 'MovementM6.2.0' 'RepeatedMovementM6.2.0' 'MajorLocationM6.2.0'
 'MinorLocationM6.2.0' 'SecondMinorLocationM6.2.0' 'ContactM6.2.0'
 'NonDominantHandshapeM6.2.0' 'UlnarRotationM6.2.0']. At least one non-missing value is needed for imputation with strategy='median'.


In [77]:
categorical_imputer = SimpleImputer(strategy='most_frequent', add_indicator=False)
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Copy categorical columns
X_categorical = X[categorical_cols].copy()

X_categorical_imputed = categorical_imputer.fit_transform(X_categorical)

# Encode categorical features
encoded_cols = pd.DataFrame(encoder.fit_transform(X_categorical))
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
categorical_cols_encoded = encoded_cols.columns.tolist()



In [79]:
# Concatenate numerical and encoded categorical columns
X_processed = pd.concat([pd.DataFrame(X_numerical_scaled), encoded_cols], axis=1)

In [102]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Initialize SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')

# Fill missing values in the target variable
y_imputed = imputer.fit_transform(signdata[['SignBankEnglishTranslations']])

# Convert the NumPy array back to a pandas Series
y_imputed = pd.Series(y_imputed.flatten())

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the target variable
y_encoded = label_encoder.fit_transform(y_imputed)

# Check for NaN values in the target variable after imputation
nan_count_after_impute = pd.Series(y_imputed).isnull().sum()
print("Number of NaN values in 'SignBankEnglishTranslations' column after imputation:", nan_count_after_impute)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_encoded, test_size=0.2, random_state=42)




Number of NaN values in 'SignBankEnglishTranslations' column after imputation: 0


In [115]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to y_train
y_train_encoded = label_encoder.fit_transform(y_train)

# Check the unique values and shape of y_train_encoded
print("Unique values in y_train_encoded:", np.unique(y_train_encoded))
print("Shape of y_train_encoded:", y_train_encoded.shape)

Unique values in y_train_encoded: [   0    1    2 ... 1467 1468 1469]
Shape of y_train_encoded: (1587,)


In [95]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import LearningRateScheduler
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define the decreasing learning rate schedule
def decreasing_schedule(epoch):
    return 0.001 * np.exp(-0.1 * epoch)

def train_autoencoder(X_train, X_test):
    # Define the autoencoder architecture
    input_dim = X_train.shape[1]
    encoding_dim = 64  # Adjust as needed

    input_layer = Input(shape=(input_dim,))
    encoder_layer1 = Dense(128, activation='relu')(input_layer)
    encoder_layer1 = BatchNormalization()(encoder_layer1)
    encoder_layer1 = Dropout(0.5)(encoder_layer1)

    encoder_layer2 = Dense(encoding_dim, activation='relu')(encoder_layer1)
    encoder_layer2 = BatchNormalization()(encoder_layer2)
    encoder_layer2 = Dropout(0.5)(encoder_layer2)

    decoder_layer1 = Dense(128, activation='relu')(encoder_layer2)
    decoder_layer1 = BatchNormalization()(decoder_layer1)

    decoder_layer2 = Dense(input_dim, activation='sigmoid')(decoder_layer1)  # Adjusted output dimensionality
    decoder_layer2 = Dropout(0.5)(decoder_layer2)

    autoencoder = Model(input_layer, decoder_layer2)

    # Define the optimizer with RMSprop
    optimizer = RMSprop(learning_rate=0.001) 

    # Compile the autoencoder model with RMSprop optimizer
    autoencoder.compile(optimizer=optimizer, loss='mean_squared_error')

    # Define the learning rate scheduler callback
    lr_scheduler = LearningRateScheduler(decreasing_schedule)

    # Train the autoencoder with learning rate scheduler
    autoencoder.fit(X_train, X_train, epochs=50, batch_size=32, shuffle=True, 
                    validation_data=(X_test, X_test), callbacks=[lr_scheduler])

    # Extract features using the encoder part of the autoencoder
    encoder = Model(input_layer, encoder_layer2)
    X_encoded_test = encoder.predict(X_test)

    # Reconstruct data using the trained autoencoder
    reconstructed_data = autoencoder.predict(X_test)

    # Combine original test data with reconstructed data
    X_test_combined = np.concatenate((X_test, reconstructed_data), axis=1)

    # Compute cosine similarity between original and reconstructed data samples
    cosine_similarities = cosine_similarity(X_test_combined)

    # Calculate the mean cosine similarity across all samples
    mean_cosine_similarity = np.mean(cosine_similarities)
    
    return mean_cosine_similarity, encoder_layer2, input_layer

In [106]:
y_subset = y_encoded[:len(X_processed)]

# Check the shape of the subsetted y
print("Shape of y_subset:", y_subset.shape)

# Now, both X_processed and y_subset should have the same number of samples
# You can proceed with splitting them into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_subset, test_size=0.2, random_state=42)

Shape of y_subset: (1984,)


In [97]:
# Train the autoencoder with the decreasing learning rate schedule and RMSprop optimizer
similarity, encoder_layer2, input_layer = train_autoencoder(X_train, X_test)
print("Mean Cosine Similarity:", similarity)

# Extract features using the encoder part of the autoencoder
encoder = Model(input_layer, encoder_layer2)
X_encoded_train = encoder.predict(X_train)
X_encoded_test = encoder.predict(X_test)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Cosine Similarity: 0.9349847123544542


In [116]:
# Step 1: Train the autoencoder and extract encoded features
mean_cosine_similarity, encoder_layer2, input_layer = train_autoencoder(X_train, X_test)

# Extract encoded features using the encoder part of the autoencoder
encoder = Model(input_layer, encoder_layer2)
X_encoded_train = encoder.predict(X_train)
X_encoded_test = encoder.predict(X_test)

# Train the model using the encoded features and target variable
model.fit(X_encoded_train, y_train_encoded)

# Make predictions on the test set
y_pred = model.predict(X_encoded_test)

# Evaluate the model performance (e.g., accuracy)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [114]:
# Step 1: Check Unique Values in y_train
unique_values_y_train = np.unique(y_train)
print("Unique values in y_train:", unique_values_y_train)

# Step 2: Compare with Expected Classes
expected_classes = np.arange(len(unique_values_y_train))
print("Expected classes based on unique values:", expected_classes)

Unique values in y_train: [   0    1    2 ... 1799 1800 1802]
Expected classes based on unique values: [   0    1    2 ... 1467 1468 1469]
