# Mixed Weighted Model

This approach trains a separate regression model for each input item, and then weights the outputs of these models by the criterion relevance scores assigned by SMEs.


In [1]:
import random
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [2]:
# Training responses, including the criterion score columns
df = pd.read_csv('data/siop_ml_train_participant.csv')

# Evaluation responses that predictions will be generated for
eval_df = pd.read_csv('data/siop_ml_dev_participant.csv')

# Weighting matrix; swap the active line to change which weighting file is used
# weights = pd.read_csv('data/weights/mvp_matrix.csv')
weights = pd.read_csv('data/weights/dummy_matrix_no_impact.csv')

# Index the weights by item name so a single (item, criterion) weight can be looked up later
weighting_matrix = weights.set_index(keys='sj_item_name')

# Preview the first rows to confirm the import worked and the columns look right
df.head(n=3)


Unnamed: 0,Respondent_ID,open_ended_1,open_ended_2,open_ended_3,open_ended_4,open_ended_5,E_Scale_score,A_Scale_score,O_Scale_score,C_Scale_score,N_Scale_score
0,10446116527,"I would change my vacation week, because I am ...",I would reach out to my boss and ask him or he...,I would not go. I am a not a social person. I ...,I would ask my manager why he/she gave me such...,I would find this experience super enjoyable. ...,2.25,3.75,3.166667,3.75,2.916667
1,10440100535,I would talk to my colleague and see if they w...,I would continue to work on the project that w...,I would talk to my colleague and try to talk t...,I would feel upset about the negative feedback...,I would find this experience enjoyable. I feel...,4.666667,4.416667,4.583333,5.0,1.333333
2,10462850071,I would feel upset because perhaps I already b...,I would start working on the project now and g...,I would feel guilty about thinking about not g...,I would feel really defensive about it. I woul...,I would find it enjoyable because I would be r...,2.25,4.75,4.083333,4.666667,2.166667


In [3]:
# Preview the evaluation frame that predictions will be generated for
eval_df.head(3)

Unnamed: 0,Respondent_ID,open_ended_1,open_ended_2,open_ended_3,open_ended_4,open_ended_5
0,10460010474,I would look into changing my vacation plans t...,I would work on the project little by little d...,I would probably still go. Just depending on h...,I would see what I could to do to improve the ...,I would absolutely enjoy being involved in thi...
1,10440103178,"I have always been a team player, but this wou...",I would first address my concerns with my boss...,I would be all in. While accompaniment would b...,I definitely would not be happy about this sit...,I would absolutely find this experience enjoya...
2,10440099430,I would try to come to a compromise with my co...,I would go to my boss and ask him if he has an...,"I would go to the event, it's possible that if...",I would pay attention and take an honest look ...,I would find it enjoyable because I love learn...


In [4]:
# Preview the weighting matrix, which is indexed by item name
weighting_matrix.head(3)

Unnamed: 0_level_0,E_Scale_score,A_Scale_score,O_Scale_score,C_Scale_score,N_Scale_score
sj_item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
open_ended_1,1,1,1,1,1
open_ended_2,1,1,1,1,1
open_ended_3,1,1,1,1,1


For clarity and simplicity, the training and criterion columns are declared in the hardcoded variables below.

In [5]:
# Free-text response columns used as model inputs (one set of models per column)
training_columns = [
    'open_ended_1',
    'open_ended_2',
    'open_ended_3',
    'open_ended_4',
    'open_ended_5',
]

# Score columns the models are trained to predict
criterion_columns = [
    'E_Scale_score',
    'A_Scale_score',
    'O_Scale_score',
    'C_Scale_score',
    'N_Scale_score',
]


In [6]:
def simple_prep (df, columns=None):
    """Return a copy of ``df`` with its text columns lowercased and stripped of punctuation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text columns to clean. It is not modified.
    columns : list of str, optional
        Names of the text columns to clean. Defaults to the notebook-level
        ``training_columns`` list.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every selected column is lowercased and every
        character that is not a letter or digit is replaced by a single space.
    """
    if columns is None:
        columns = training_columns

    # Work on a copy so the raw frame shown earlier in the notebook stays untouched
    prepped = df.copy()

    for col in columns:
        # pandas string operations return new Series, so the result must be
        # assigned back; otherwise the cleaning silently has no effect
        lowered = prepped[col].str.lower()

        # Remove non-alphanumeric characters
        prepped[col] = lowered.replace('[^a-zA-Z0-9]', ' ', regex = True)

    return prepped

# Run the text preparation step over the training frame
prepped_data = simple_prep (df)


In [7]:
# Generate our testing and training sets. The split is seeded so that a
# Restart & Run All reproduces the same partition (and therefore the same models).
train, test = train_test_split(prepped_data, test_size=0.01, random_state=42)

# Show their relative sizes
len(train), len(test)


### Transformation and Training

First we'll need to define the function for vectorizing each input column.

In [8]:
# Abstract away as much as possible so we can reuse this general vectorizing and training function
def vectorize_and_train (df_train, y_train):
    """Fit a TF-IDF vectorizer on the given text and train a new model on the result.

    Parameters
    ----------
    df_train : iterable of str
        Text documents to vectorize.
    y_train : array-like
        Target values aligned with ``df_train``.

    Returns
    -------
    tuple
        ``(X_train, y_train, vectorizer, mod)``: the vectorized training matrix,
        the targets exactly as passed in, the fitted vectorizer (needed later to
        transform evaluation text with the same vocabulary), and the fitted model.
    """
    # TF-IDF over word n-grams from unigrams up to 5-grams, with no minimum document frequency cut-off
    tfidf = TfidfVectorizer(ngram_range=(1,5), min_df=1)

    # Learn the vocabulary and convert the text into vectors in one pass
    features = tfidf.fit_transform(df_train)

    # Each call gets its own freshly initialised model, fitted on these features
    regressor = new_model()
    regressor.fit(features, y_train)

    return features, y_train, tfidf, regressor


# Allow a new model to be initialized for each column / regressor
def new_model ():
    """Return a fresh, unfitted Ridge regressor with the model parameters used throughout."""
    return Ridge(alpha=1.0, random_state=42)


In [9]:
# Nested lookups, each addressed as [training column][criterion column]
X_training_set, y_training_set = {}, {}
vectorizer_set, model_set = {}, {}

# Iterate over the training columns
for training_col in training_columns:

    # Give every lookup an empty inner dict for this column so the models can be accessed later
    for registry in (X_training_set, y_training_set, vectorizer_set, model_set):
        registry[training_col] = {}

    # Within the training columns, iterate over the outcome variables
    for criterion_col in criterion_columns:
        fitted = vectorize_and_train (train[training_col], train[criterion_col])
        X_fit, y_fit, fitted_vectorizer, fitted_model = fitted

        X_training_set[training_col][criterion_col] = X_fit
        y_training_set[training_col][criterion_col] = y_fit
        vectorizer_set[training_col][criterion_col] = fitted_vectorizer
        model_set[training_col][criterion_col] = fitted_model


### Transforming and Predicting on Input Data

Since we've set up the nested model structures, we can move ahead and process the input file we received at the beginning of the notebook.

In [10]:
# Run the evaluation text through the same preparation step as the training text
eval_prepped = simple_prep (eval_df)

# Iterate over our outcome variables
for ec_column in criterion_columns:

    # Collect one prediction vector and one weight per input column. Every
    # column's prediction has to be kept: overwriting a single variable inside
    # the loop would leave only the last column's prediction in the "average".
    item_predictions = []
    item_weights = []

    # Predict a value from a model trained on each column
    for sj_column in training_columns:

        # Transform the evaluation data into vectorized data on the correct vectorizer vocab
        eval_transformed = vectorizer_set[sj_column][ec_column].transform(eval_prepped[sj_column])

        # Predict the values with the corresponding model
        item_predictions.append(model_set[sj_column][ec_column].predict(eval_transformed))

        # Look up the weight for this (input column, criterion) pair in a single .loc call
        item_weights.append(weighting_matrix.loc[sj_column, ec_column])

    # Weighted average across the input columns: sum(w_i * pred_i) / sum(w_i).
    # Normalising by the weight total keeps predictions on the original score
    # scale whatever the magnitude of the weights; with all weights equal this
    # is the plain mean. np.average raises ZeroDivisionError if the weights
    # for a criterion sum to zero.
    eval_df[ec_column] = np.average(np.vstack(item_predictions), axis=0, weights=item_weights)


In [21]:
# Drop the training columns so we report on only Respondent ID and their predicted values
output = eval_df.drop(columns=training_columns)

# Set the headers asked for by the challenge organizers
correct_headers = ['Respondent_ID', 'E_Pred', 'A_Pred', 'O_Pred','C_Pred', 'N_Pred']

# Show the current column titles to ensure they're in the correct order
print('Replace column names from \n{0} to:\n{1}'.format(list(output), correct_headers))

# Rename our column headers so they match what folks expect. The rename is
# positional, so the order printed above must match. Assign the flat list:
# wrapping it in another list makes pandas build a one-level MultiIndex
# instead of plain column labels.
output.columns = correct_headers

# Send the output frame to a CSV, and exclude the indices with index=False
output.to_csv('data/example_output2.csv', encoding='utf-8', index=False)

Replace column names from 
['Respondent_ID', 'E_Scale_score', 'A_Scale_score', 'O_Scale_score', 'C_Scale_score', 'N_Scale_score'] to:
['Respondent_ID', 'E_Pred', 'A_Pred', 'O_Pred', 'C_Pred', 'N_Pred']


### Model Testing and Evaluation

Now that our models are trained, we want to evaluate their performance on the held-out test data.

In [12]:
# TODO: evaluation on the held-out test data is not implemented yet :sadpanda:

# If you want to modify this section to include test evaluation, this would be really helpful