In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [2]:
# Columns to pull from the CSV; every other column in the file is skipped.
signdata_usecols = [
    'SignOnset(ms)', 'SignOffset(ms)', 'SignDuration(ms)',
    'CDISemanticCategory', 'SignBankSemanticField',
    'SignBankEnglishTranslations', 'EntryID', 'LemmaID',
    'GuessConsistency', 'GuessAccuracy', 'LexicalClass',
    'Handshape.2.0', 'Movement.2.0', 'Flexion.2.0', 'UlnarRotation.2.0', 'Iconicity(Z)',
]

# Load the sign dataset (the file is read as latin-1).
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact location exists.
signdata = pd.read_csv(
    '/Users/emilkoch/Library/Mobile Documents/com~apple~CloudDocs/Data Files/signdata.csv',
    usecols=signdata_usecols,
    encoding='latin-1',
)

# Select numerical columns
numerical_cols = [
    'GuessConsistency', 'GuessAccuracy',
    'SignOnset(ms)', 'SignOffset(ms)', 'SignDuration(ms)',
    'UlnarRotation.2.0', 'Iconicity(Z)',
]

In [10]:
# Build the model feature matrix `X_signdata`:
#   1. numerical columns  -> median imputation -> standardisation
#   2. categorical columns -> most-frequent imputation -> one-hot encoding
#   3. both parts concatenated column-wise
#
# NOTE(review): the imputers, scaler and encoder are all fit on the full
# dataset here, before the train/test split done in a later cell, so test-row
# statistics influence the preprocessing.

# Select numerical columns
numerical_cols = ['GuessConsistency', 'GuessAccuracy', 'SignOnset(ms)', 'SignOffset(ms)', 'SignDuration(ms)', 'UlnarRotation.2.0', 'Iconicity(Z)']

# Initialize SimpleImputer to fill missing values with median
imputer = SimpleImputer(strategy='median')

# Fill missing values and scale numerical features
numerical_data = signdata[numerical_cols].copy()  # Work on a copy so `signdata` stays untouched
numerical_data[numerical_cols] = imputer.fit_transform(numerical_data)  # Impute missing values
scaler = StandardScaler()
numerical_data[numerical_cols] = scaler.fit_transform(numerical_data)  # Scale numerical features

# Check the first few rows of the dataframe to verify the preprocessing
print(numerical_data.head())

# Select categorical columns
# NOTE(review): EntryID and LemmaID look like identifiers; one-hot encoding
# them may add close to one column per row - confirm they belong here.
categorical_cols = ['EntryID', 'LemmaID', 'LexicalClass', 'Handshape.2.0', 'Movement.2.0',
                    'SignBankSemanticField','CDISemanticCategory', 'Flexion.2.0']

# Initialize SimpleImputer to fill missing values with most frequent value
# (`imputer` is deliberately rebound; the median imputer is no longer needed)
imputer = SimpleImputer(strategy='most_frequent')

# Fill missing values in categorical columns
categorical_data = signdata[categorical_cols].copy()  # Work on a copy so `signdata` stays untouched
categorical_data[categorical_cols] = imputer.fit_transform(categorical_data)  # Impute missing values

# Initialize OneHotEncoder with dense output.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older installs.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Encode categorical columns. The frame reuses the source index so the
# index-aligned concat below can never pair rows from different records.
encoded_cols = pd.DataFrame(
    encoder.fit_transform(categorical_data),
    columns=encoder.get_feature_names_out(categorical_cols),
    index=categorical_data.index,
)
# Column names after one-hot encoding
categorical_cols_encoded = encoded_cols.columns.tolist()

# Concatenate numerical and encoded categorical columns
X_signdata = pd.concat([numerical_data, encoded_cols], axis=1)

   GuessConsistency  GuessAccuracy  SignOnset(ms)  SignOffset(ms)  \
0          0.085438      -0.166756      -1.695099       -1.751844   
1          0.085438      -0.166756      -1.156618        0.017815   
2          0.085438      -0.166756      -3.567728       -1.127625   
3          0.085438      -0.166756      -1.421840       -1.751844   
4          0.085438      -0.166756      -1.960321       -0.918512   

   SignDuration(ms)  UlnarRotation.2.0  Iconicity(Z)  
0         -1.168372            2.25347      0.909325  
1          0.498569           -0.44376     -0.829772  
2          0.275199           -0.44376      0.234955  
3         -1.281723            2.25347     -1.340737  
4         -0.168207           -0.44376      0.301095  




In [12]:
# Encode the target column 'SignBankEnglishTranslations' as integer class ids.

# Check for NaN values in the target column first, so missing targets are
# reported before the encoder ever sees the column.
nan_count = signdata['SignBankEnglishTranslations'].isnull().sum()
print("Number of NaN values in 'SignBankEnglishTranslations' column:", nan_count)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding; the result is stored as a new column on `signdata`
signdata['SignBankEnglishTranslations_encoded'] = label_encoder.fit_transform(signdata['SignBankEnglishTranslations'])

Number of NaN values in 'SignBankEnglishTranslations' column: 0


In [13]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled numerical features with all degree-2 terms
# (squares and pairwise products); no constant bias column is generated.
poly = PolynomialFeatures(degree=2, include_bias=False)

# Generate polynomial features for numerical variables
poly_features = poly.fit_transform(numerical_data[numerical_cols])

# Convert the polynomial features array into a DataFrame that shares the
# index of `numerical_data`, so the concat below aligns row-for-row.
poly_df = pd.DataFrame(
    poly_features,
    columns=poly.get_feature_names_out(numerical_cols),
    index=numerical_data.index,
)

# The expansion already contains the degree-1 terms under the original column
# names. Appending all of `poly_df` would duplicate every numerical column, so
# only the columns that are new are added.
new_poly_cols = [col for col in poly_df.columns if col not in numerical_data.columns]
numerical_data_poly = pd.concat([numerical_data, poly_df[new_poly_cols]], axis=1)

In [14]:
# Create interaction terms between pairs of numerical variables.
# `interaction_terms` holds the original numerical columns plus one product
# column per unordered pair, named '<a>_<b>_interaction'.
interaction_terms = numerical_data[numerical_cols].copy()
for i, left in enumerate(numerical_cols):
    for right in numerical_cols[i + 1:]:
        col_name = f'{left}_{right}_interaction'
        interaction_terms[col_name] = numerical_data[left] * numerical_data[right]

# Concatenate interaction terms with the original numerical data.
# `interaction_terms` already carries a copy of the original columns, so only
# the product columns are appended; otherwise every numerical column would
# appear twice under the same name.
new_interaction_cols = [col for col in interaction_terms.columns if col not in numerical_data.columns]
numerical_data_interact = pd.concat([numerical_data, interaction_terms[new_interaction_cols]], axis=1)

In [16]:
import category_encoders as ce
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_signdata,
    signdata['SignBankEnglishTranslations_encoded'],
    test_size=0.2,
    random_state=42,
)

# Target-encode the one-hot columns. The encoder learns its mapping from the
# training split only and that same mapping is then applied to the test split.
# NOTE(review): these columns are already one-hot encoded and the target is an
# integer class id from LabelEncoder - confirm target encoding is intended here.
target_encoder = ce.TargetEncoder(cols=categorical_cols_encoded)
X_train_encoded = target_encoder.fit_transform(X_train, y_train)
X_test_encoded = target_encoder.transform(X_test)

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a random forest on the target-encoded training split and evaluate it
# on the held-out split.

# Initialize the Random Forest Classifier (fixed seed for reproducibility)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the model to the training data
rf_classifier.fit(X_train_encoded, y_train)

# Predictions on the testing data
y_pred = rf_classifier.predict(X_test_encoded)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report.
# Many classes have no true or no predicted samples in the test split, which
# leaves precision/recall undefined. zero_division=0 reports those as 0.0
# (the same value the default shows) without a warning for each one.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.26788990825688075
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          18       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         1
          21       0.00      0.00      0.00         0
          26       0.00      0.00      0.00         1
          27       0.00      0.00      0.00         1
          34       0.00      0.00      0.00         1
          36       0.00      0.00      0.00         1
          40       0.00      0.00      0.00         1
          43       0.00      0.00      0.00         1
          46       0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
