In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import PolynomialFeatures
import category_encoders as ce
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
signdata = pd.read_csv('/Users/emilkoch/Library/Mobile Documents/com~apple~CloudDocs/Data Files/signdata.csv', encoding='latin-1')

In [17]:
# Separate target variable from features
X = signdata.drop(columns=['SignBankEnglishTranslations'])  # Features
y = signdata['SignBankEnglishTranslations']

# Separate numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print("Numerical Columns:", numerical_cols)
print("Categorical Columns:", categorical_cols)

Numerical Columns: ['List', 'Item', 'EnglishWF(lg10)', 'SignFrequency(M)', 'SignFrequency(SD)', 'SignFrequency(Z)', 'SignFrequency(N)', 'Unknown', 'SignFrequency(M-Native)', 'SignFrequency(SD-Native)', 'SignFreq(Z-native)', 'SignFrequency(N-Native)', 'Unknown(Native)', 'SignFrequency(M-Nonnative)', 'SignFrequency(SD-Nonnative)', 'SignFrequency(N-Nonnative)', 'SignFreq(Z-Nonnative)', 'Unknown(Nonnative)', 'DominantTranslationAgreement', 'DominantTranslationAgreement(Native)', 'DominantTranslationAgreement(Nonnative)', 'Iconicity(M)', 'Iconicity(SD)', 'Iconicity(Z)', 'Iconicity(N)', 'D.Iconicity(M)', 'D.Iconicity(SD)', 'D.Iconicity(N)', 'D.Iconicity(Z)', 'D.Iconicity(M-native)', 'D.Iconicity(SD-native)', 'D.Iconicity(Z-native)', 'D.Iconicity(N-native)', 'GuessConsistency', 'GuessAccuracy', 'Transparency(M)', 'Transparency SD', 'Transparency Z', 'Initialized.2.0', 'FingerspelledLoanSign.2.0', 'Compound.2.0', 'NumberOfMorphemes.2.0', 'SignOnset(ms)', 'SignOffset(ms)', 'SignDuration(ms)', '

In [4]:
# Preprocessing for numerical features
numerical_imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# Copy numerical columns
X_numerical = X[numerical_cols].copy()
print(len(X_numerical))
print(len(numerical_cols))
print(X_numerical.head())
print(X_numerical.isnull().sum())

2723
129
   List  Item  EnglishWF(lg10)  SignFrequency(M)  SignFrequency(SD)  \
0     1     2            3.521             5.143              2.081   
1     1     3            4.645             6.032              1.516   
2     1     4            2.600             4.429              1.720   
3     1     5            2.928             2.621              1.720   
4     1     8            3.041             1.579              0.838   

   SignFrequency(Z)  SignFrequency(N)  Unknown  SignFrequency(M-Native)  \
0             0.621                21    0.000                    5.167   
1             1.068                31    0.000                    6.111   
2             0.232                21    0.000                    4.167   
3            -0.753                29    0.065                    2.000   
4            -1.198                19    0.095                    1.455   

   SignFrequency(SD-Native)  ...  ThumbContact.2.0Frequency  \
0                     2.167  ...                  

In [5]:
# Impute missing values and scaling
imputer = SimpleImputer(strategy='median')
X_numerical_imputed = imputer.fit_transform(X_numerical) 
scaler = StandardScaler()
X_numerical_scaled  = scaler.fit_transform(X_numerical_imputed)  

 'SignTypeM5.2.0' 'MovementM5.2.0' 'RepeatedMovementM5.2.0'
 'MajorLocationM5.2.0' 'MinorLocationM5.2.0' 'SecondMinorLocationM5.2.0'
 'ContactM5.2.0' 'NonDominantHandshapeM5.2.0' 'UlnarRotationM5.2.0'
 'FlexionChangeM6.2.0' 'SpreadChangeM6.2.0' 'SignTypeM6.2.0'
 'MovementM6.2.0' 'RepeatedMovementM6.2.0' 'MajorLocationM6.2.0'
 'MinorLocationM6.2.0' 'SecondMinorLocationM6.2.0' 'ContactM6.2.0'
 'NonDominantHandshapeM6.2.0' 'UlnarRotationM6.2.0']. At least one non-missing value is needed for imputation with strategy='median'.


In [6]:
# Preprocessing for categorical features
categorical_imputer = SimpleImputer(strategy='most_frequent')
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Copy categorical columns
X_categorical = X[categorical_cols].copy()

# Impute missing values in categorical columns
X_categorical = pd.DataFrame(categorical_imputer.fit_transform(X_categorical), columns=X_categorical.columns)

# Encode categorical features
encoded_cols = pd.DataFrame(encoder.fit_transform(X_categorical))
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
categorical_cols_encoded = encoded_cols.columns.tolist()



In [7]:
# Concatenate numerical and encoded categorical columns
X_processed = pd.concat([pd.DataFrame(X_numerical_scaled), encoded_cols], axis=1)

In [22]:
# Check for NaN values in the target variable
nan_count = signdata['SignBankEnglishTranslations'].isnull().sum()
print("Number of NaN values in 'SignBankEnglishTranslations' column:", nan_count)

# Initialize SimpleImputer
imputer = SimpleImputer(strategy='most_frequent')

# Apply SimpleImputer to fill missing values in the target variable
filled_values = imputer.fit_transform(signdata[['SignBankEnglishTranslations']])
y_imputed = filled_values.flatten()  # Flatten the 2D array to 1D before assigning back to the Series

# Check for NaN values in the target variable after imputation
nan_count_after_impute = pd.Series(y_imputed).isnull().sum()
print("Number of NaN values in 'SignBankEnglishTranslations' column after imputation:", nan_count_after_impute)
print(y_imputed.dtype)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to the target variable 'SignBankEnglishTranslations'
y_encoded = label_encoder.fit_transform(y_imputed)
print(y_encoded.dtype)

Number of NaN values in 'SignBankEnglishTranslations' column: 739
Number of NaN values in 'SignBankEnglishTranslations' column after imputation: 0
object
int64


In [56]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Convert feature names to strings
X_processed.columns = [str(col) for col in X_processed.columns]

# Fit the classifier to your data
rf_classifier.fit(X_processed, y_encoded)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Filter features based on the threshold
selected_features = X_processed.columns[feature_importances > 0.01]
print(selected_features.isna().sum())

0


In [52]:
# Calculate pairwise correlation between 'SignBankEnglishTranslations' and other numerical columns
correlation_with_target = X_processed.corrwith(pd.Series(y_encoded))

# Sort correlations in descending order
correlation_with_target_sorted = correlation_with_target.abs().sort_values(ascending=False)

# Print the correlation values
print("Pairwise Correlation with 'SignBankEnglishTranslations':")
print(correlation_with_target_sorted)

# Fill NaN values with 0
X_processed_filled = X_processed.fillna(0)

# Filter correlations greater than 0.01
significant_correlations = correlation_with_target_sorted[correlation_with_target_sorted.abs() > 0.01]

# Create a new DataFrame to store significant correlations
significant_correlations_df = pd.DataFrame({'Feature': significant_correlations.index, 'Correlation': significant_correlations.values})

# Print the new DataFrame
print("Significant Correlations (> 0.01):")
print(significant_correlations_df)
print(significant_correlations_df.isna().sum())

Pairwise Correlation with 'SignBankEnglishTranslations':
SignBankLemmaID_DOG                   0.201446
SignBankAnnotationID_SHOUT            0.196797
SignBankSemanticField_Animal          0.158351
CDISemanticCategory_Animals           0.119098
15                                    0.105201
                                        ...   
MovementM4.2.0_Straight                    NaN
SecondMinorLocationM4.2.0_HandAway         NaN
HandshapeM6.2.0_r                          NaN
SelectedFingersM6.2.0_im                   NaN
FlexionM6.2.0_Crossed                      NaN
Length: 14060, dtype: float64
Significant Correlations (> 0.01):
                           Feature  Correlation
0              SignBankLemmaID_DOG     0.201446
1       SignBankAnnotationID_SHOUT     0.196797
2     SignBankSemanticField_Animal     0.158351
3      CDISemanticCategory_Animals     0.119098
4                               15     0.105201
...                            ...          ...
8188    SignBankAnnotatio

In [26]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Convert feature names to strings
X_processed.columns = [str(col) for col in X_processed.columns]

# Fit the classifier to your data
rf_classifier.fit(X_processed, y_encoded)

# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Filter features based on the threshold
selected_features_rf = X_processed.columns[feature_importances > 0.01]

# Convert significant correlations to a set for efficient comparison
significant_correlations_set = set(significant_correlations_df['Feature'])

# Convert selected features by random forest to a set for efficient comparison
selected_features_rf_set = set(selected_features_rf)

# Find common features between significant correlations and random forest selected features
common_features = significant_correlations_set.intersection(selected_features_rf_set)

# Print common features
print("Common Features Selected by Random Forest and Significant Correlations (> 0.01):")
print(common_features)

Common Features Selected by Random Forest and Significant Correlations (> 0.01):
{'SignBankLemmaID_DOG', '103', 'SignBankAnnotationID_SHOUT'}


In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define X_train_corr and X_test_corr using the features from significant_correlations_df
X_train_corr = X_train[significant_correlations_df['Feature']]
X_test_corr = X_test[significant_correlations_df['Feature']]

# Define X_train_rf and X_test_rf using the selected features
X_train_rf = X_train[selected_features]
X_test_rf = X_test[selected_features]

# Train Random Forest using significant correlations
rf_corr_classifier = RandomForestClassifier(random_state=42)
rf_corr_classifier.fit(X_train_corr, y_train)

# Train Random Forest using randomly mixed features
rf_rf_classifier = RandomForestClassifier(random_state=42)
rf_rf_classifier.fit(X_train_rf, y_train)

# Predictions
y_pred_corr = rf_corr_classifier.predict(X_test_corr)
y_pred_rf = rf_rf_classifier.predict(X_test_rf)

# Evaluate performance
accuracy_corr = accuracy_score(y_test, y_pred_corr)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print("Accuracy using features with significant correlations:", accuracy_corr)
print("Accuracy using randomly mixed features:", accuracy_rf)

Accuracy using features with significant correlations: 0.28440366972477066
Accuracy using randomly mixed features: 0.3467889908256881
