# Mixed Weighted Model

This approach trains a regression model for each input item and then weights the outputs of those models according to the criterion-relevance scores assigned by subject-matter experts (SMEs).


In [1]:
import random
import pandas as pd
import numpy as np
import scipy as sp

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [2]:
# Weighting matrix used to combine the per-item model outputs
weighting_matrix = pd.read_csv('data/weights/dummy_matrix.csv')

# Evaluation (dev) responses that will be scored later in the notebook
eval_df = pd.read_csv('data/siop_ml_dev_participant.csv')

# Training responses together with their scale scores
df = pd.read_csv('data/siop_ml_train_participant.csv')

# Peek at the first rows to confirm the training data loaded with the expected layout
df.head(n=3)


Unnamed: 0,Respondent_ID,open_ended_1,open_ended_2,open_ended_3,open_ended_4,open_ended_5,E_Scale_score,A_Scale_score,O_Scale_score,C_Scale_score,N_Scale_score
0,10446116527,"I would change my vacation week, because I am ...",I would reach out to my boss and ask him or he...,I would not go. I am a not a social person. I ...,I would ask my manager why he/she gave me such...,I would find this experience super enjoyable. ...,2.25,3.75,3.166667,3.75,2.916667
1,10440100535,I would talk to my colleague and see if they w...,I would continue to work on the project that w...,I would talk to my colleague and try to talk t...,I would feel upset about the negative feedback...,I would find this experience enjoyable. I feel...,4.666667,4.416667,4.583333,5.0,1.333333
2,10462850071,I would feel upset because perhaps I already b...,I would start working on the project now and g...,I would feel guilty about thinking about not g...,I would feel really defensive about it. I woul...,I would find it enjoyable because I would be r...,2.25,4.75,4.083333,4.666667,2.166667


For clarity and simplicity, the training and criterion columns are declared in the hard-coded variables below.

In [3]:
# Free-text response columns used as model inputs: open_ended_1 ... open_ended_5
training_columns = ['open_ended_%d' % item_number for item_number in range(1, 6)]

# Scale-score columns used as prediction targets, one per scale prefix
criterion_columns = [prefix + '_Scale_score' for prefix in 'EAOCN']

In [4]:
def simple_prep(df, columns=None):
    """Return a copy of ``df`` with its free-text columns normalised.

    Every selected column is lower-cased, and each character outside
    ``[a-zA-Z0-9]`` (whitespace included) is replaced by a single space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns. It is not modified.
    columns : iterable of str, optional
        Names of the columns to clean. Defaults to the notebook-level
        ``training_columns`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned text; all other columns are untouched.
    """
    if columns is None:
        columns = training_columns

    # Work on a copy so re-running the cell never alters the raw frame
    prepped = df.copy()

    for col in columns:
        # pandas string methods return a new Series rather than editing in
        # place, so the cleaned result has to be assigned back to the column
        prepped[col] = (
            prepped[col]
            .str.lower()
            .replace('[^a-zA-Z0-9]', ' ', regex=True)
        )

    return prepped

# Run the training frame through the text-preparation step
prepped_data = simple_prep(df)

In [5]:
# Hold out 1% of the prepped rows as a test set; the fixed seed keeps the
# split identical across kernel restarts
train, test = train_test_split(prepped_data, test_size=0.01, random_state=42)

# Show the relative sizes of the two sets
len(train), len(test)

### Transformation and Training

First we'll need to define the function for vectorizing each input column.

In [6]:
# Abstract away as much as possible so we can reuse this general vectorizing and training function
def vectorize_and_train (df_train, y_train):
    """Fit a TF-IDF vectorizer on one text column and train a regressor on it.

    Parameters
    ----------
    df_train : iterable of str
        The raw text documents to vectorize (one training column).
    y_train : array-like
        Target values, one per document in ``df_train``.

    Returns
    -------
    tuple
        ``(X_train, y_train, vectorizer, mod)``: the TF-IDF matrix produced
        from ``df_train``, the targets exactly as passed in, the fitted
        vectorizer, and the fitted model.
    """
    # TF-IDF over unigrams, bigrams and trigrams; min_df=1 keeps every term
    vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1,3))

    # Learn the vocabulary from the training text and convert it into vectors
    X_train = vectorizer.fit_transform(df_train)

    # Generate a new model instance so nothing is shared between calls
    mod = new_model()

    # Fit our model with the data
    mod.fit(X_train, y_train)

    # Return the fitted vectorizer and model alongside the data they were fit
    # on, so the same vocabulary can be reused when transforming new text
    return X_train, y_train, vectorizer, mod

def new_model ():
    """Build a fresh, unfitted Ridge regressor so each column / regressor gets its own instance."""
    # Hyper-parameters shared by every model created through this factory
    return Ridge(alpha=1.0, random_state=42)

In [7]:
# Containers for every fitted artefact, indexed as [training column][criterion column]
X_training_set, y_training_set, vectorizer_set, model_set = {}, {}, {}, {}

for training_col in training_columns:

    # Start an empty inner dict per text column so results can be looked up later
    X_training_set[training_col] = {}
    y_training_set[training_col] = {}
    vectorizer_set[training_col] = {}
    model_set[training_col] = {}

    # One vectorizer + model per (text column, outcome variable) pair.
    # NOTE(review): vectorize_and_train fits a new TF-IDF vectorizer from the text
    # column alone, so the same vectorizer is refit for every criterion.
    for criterion_col in criterion_columns:
        fitted_X, fitted_y, fitted_vectorizer, fitted_model = vectorize_and_train(
            train[training_col], train[criterion_col]
        )
        X_training_set[training_col][criterion_col] = fitted_X
        y_training_set[training_col][criterion_col] = fitted_y
        vectorizer_set[training_col][criterion_col] = fitted_vectorizer
        model_set[training_col][criterion_col] = fitted_model


### Transforming and Predicting on Input Data

Since we've assessed that the model works and is making acceptable predictions, we can move ahead and process the input file we received at the beginning of the notebook.

In [8]:
# Score the evaluation file with the trained models.

# TODO: iterate over every column in training_columns and every outcome in
# criterion_columns. For now only a single pair is scored:
# open_ended_1 -> E_Scale_score.
sj_column = 'open_ended_1'
ec_column = 'E_Scale_score'

# Put the evaluation text through the same preparation step as the training
# text, so both reach the vectorizer in the same form
# (assumes eval_df contains every column in training_columns — TODO confirm)
eval_prepped = simple_prep(eval_df)

# Transform the evaluation data into vectorized data on the correct vectorizer vocab
eval_transformed = vectorizer_set[sj_column][ec_column].transform(eval_prepped[sj_column])

# Predict the values with the corresponding model
y_pred = model_set[sj_column][ec_column].predict(eval_transformed)



In [9]:
# Pair each respondent with their predicted score, fixing the column order explicitly
submission_columns = ["Respondent_ID", "Prediction"]
output = pd.DataFrame(
    {"Respondent_ID": eval_df["Respondent_ID"], "Prediction": y_pred},
    columns=submission_columns,
)

# Write the predictions to CSV; index=False leaves the row index out of the file
output.to_csv('data/wtf_are_we_doing.csv', index=False, encoding='utf-8')

### Model Testing and Evaluation

Now that we've got our models trained, we'll want to evaluate their performance on test data

In [10]:
# TODO: test evaluation is not implemented yet.

# Modifying this section to include test evaluation would be really helpful.