# TF-IDF and Linear Regression Demo / PoC
https://stats.stackexchange.com/questions/286125/how-to-handle-text-data-in-regression

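This notebook is a proof of concept: it fits a ridge regression on TF-IDF features of a single open-ended survey answer (`open_ended_1`) to predict a single scale score (`E_Scale_score`).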

In [1]:
import random
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge

In [2]:
# Read the participant training set and keep a single answer / score pair for
# the demonstration by dropping the respondent id, the other open-ended
# answers and the other scale scores.
train = pd.read_csv('data/siop_ml_train_participant.csv').drop(
    [
        'Respondent_ID',
        'open_ended_2', 'open_ended_3', 'open_ended_4', 'open_ended_5',
        'A_Scale_score', 'O_Scale_score', 'C_Scale_score', 'N_Scale_score',
    ],
    axis=1,
)

# Peek at the first rows to confirm the import and the remaining columns
train.head(3)

Unnamed: 0,open_ended_1,E_Scale_score
0,"I would change my vacation week, because I am ...",2.25
1,I would talk to my colleague and see if they w...,4.666667
2,I would feel upset because perhaps I already b...,2.25


In [3]:
def simple_prep(df, column):
    """Return a copy of ``df`` with the text in ``column`` normalised.

    The text is lowercased and every character outside ``a-z``, ``A-Z`` and
    ``0-9`` (including whitespace) is replaced by a single space.

    Args:
        df: DataFrame holding the text column.
        column: Name of the text column to clean.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    # Both pandas calls return new Series rather than editing in place, so the
    # result has to be captured -- calling them bare leaves the text untouched.
    cleaned = df[column].str.lower().replace('[^a-zA-Z0-9]', ' ', regex=True)

    # Work on a copy so re-running the cell never alters the caller's frame
    prepped = df.copy()
    prepped[column] = cleaned
    return prepped

# Clean the single free-text answer kept for this demo
prepped_data = simple_prep(df=train, column='open_ended_1')

In [4]:
def vectorize_training_data(df, column, min_df=5):
    """Fit a TF-IDF vectorizer on one text column and vectorize that column.

    Args:
        df: DataFrame (or other mapping) holding the text under ``column``.
        column: Name of the text column to vectorize.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``. Defaults to 5,
            the value this notebook was written with.

    Returns:
        A ``(X, vectorizer)`` tuple: the result of ``fit_transform`` on the
        column, and the fitted vectorizer.
    """
    # Set the TF-IDF vectorization settings
    vectorizer = TfidfVectorizer(min_df=min_df)

    # Fit on the column and convert its text into vectors in one pass
    X = vectorizer.fit_transform(df[column])

    # Return the fitted vectorizer too, so new text can later be transformed
    # with the same vocabulary the model was trained on
    return X, vectorizer
    
# Vectorize the prepped answers; keep the fitted vectorizer for scoring samples later
X, vectorizer = vectorize_training_data(df=prepped_data, column='open_ended_1')

In [5]:
# Regression target: the score column that pairs with the vectorized answers
y = train['E_Scale_score']

# Ridge regression estimator with a fixed random_state
clf = Ridge(alpha=1.0, random_state=241)

# Fit on the vectorized text; kept as the last expression so the cell echoes
# the fitted estimator
clf.fit(X, y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [6]:
def see_sample(row_num, vectorizer):
    """Print one training row's answer, its actual score and the predicted score.

    Reads the notebook-level ``train``, ``prepped_data`` and ``clf`` objects.

    Args:
        row_num: Index label of the row to inspect.
        vectorizer: Fitted vectorizer used to turn the answer into features.
    """
    print ('User input for row [{0}]: '.format(row_num))
    print (train.loc[row_num, 'open_ended_1'])

    print ('\nActual Score: ')
    print (train.loc[row_num, 'E_Scale_score'])

    # The vectorizer was fitted on prepped_data, so build the features from
    # that same text; the answer printed above stays in its original form.
    sample_text = prepped_data.loc[row_num, 'open_ended_1']
    sample_test_data = vectorizer.transform([sample_text])
    rslt = clf.predict(sample_test_data)

    print ('\nPredicted Score: ')
    print (rslt[0])

# To see results for a specific row, change this value to a row index.
# randrange excludes its upper bound, so the pick is always within the rows;
# randint(0, len(...)) could return len(...) itself, one past the last row.
test_row_index = random.randrange(len(train['open_ended_1']))

see_sample (test_row_index, vectorizer)

User input for row [63]: 
I would probably not change my plans for vacation because it would cause everyone in my family to have to change their plans as well if the plans and reservations, as well as time requested off of work have already been approved by their supervisors and other arrangements made. I would explain that my family has already made arrangements with their employers, schools, etc. This would be something very important to me, because I do not make last minute vacation plans. I plan well in advance.

Actual Score: 
3.0833333332999997

Predicted Score: 
3.374346670834718
