In [13]:
# Data management
import pandas as pd

# Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
import pickle
# Model Metrics
from sklearn.metrics import accuracy_score
from text_processing import tokenize_and_lemmatize
# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# from myFunctions import *
import warnings
import time
start= time.time()
warnings.filterwarnings('ignore')

In [29]:
# !pip install buildins

ERROR: Could not find a version that satisfies the requirement buildins
ERROR: No matching distribution found for buildins


In [14]:
# Read the essays CSV (cp1252-encoded) and discard the author-ID column
df = pd.read_csv('../media/essays.csv', encoding='cp1252').drop(columns=['#AUTHID'])
# Target columns: the training loop below fits one set of models per column
cols = ['extraversion', 'neuroticism', 'agreeableness', 'conscientiousness', 'openness']
# Positional rename: the text column comes first, followed by the target columns
df.columns = ['posts'] + cols

In [15]:
import contractions
from collections import Counter
import re
import nltk
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
# nltk.download('wordnet')
# nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

def undersample_majority(df, cls_col, text_col='posts', threshold=0.7, random_state=None):
    """Undersample the over-represented class(es) when the labels are imbalanced.

    The minority/majority count ratio of ``df[cls_col]`` is compared with
    ``threshold``. If the ratio is at or above it, ``df`` is returned untouched.
    Otherwise the rows are resampled with ``RandomUnderSampler`` (default
    sampling strategy) and a new frame is built.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``text_col`` and ``cls_col``.
    cls_col : str
        Name of the label column.
    text_col : str, default 'posts'
        Name of the text column carried through the resampling.
    threshold : float, default 0.7
        Resampling happens only when minority/majority is below this value.
    random_state : int or None, default None
        Seed forwarded to ``RandomUnderSampler``; pass an int for a
        reproducible selection of rows.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when no resampling is needed (including an empty frame
        or a single class). Otherwise a new frame with only ``text_col`` and
        ``cls_col`` and a fresh default index.
    """
    counter = Counter(df[cls_col])

    # An empty frame or a single class has nothing to balance; this also
    # avoids calling min()/max() on an empty sequence.
    if len(counter) < 2:
        return df

    ratio = min(counter.values()) / max(counter.values())
    if ratio >= threshold:
        return df

    # The sampler expects a 2-D feature array, so the text column is reshaped
    # to a single-feature matrix and flattened again afterwards.
    X = np.array(df[text_col]).reshape(-1, 1)
    y = np.array(df[cls_col])
    under = RandomUnderSampler(random_state=random_state)
    X, y = under.fit_resample(X, y)
    print(Counter(y))
    return pd.DataFrame({text_col: list(X.flatten()), cls_col: y}, columns=[text_col, cls_col])

# Function to remove links and symbols
def clear_text(data):
    """Return a cleaned copy of every entry in ``data.posts``.

    Each post is lower-cased, then links, @mentions and #hashtags are removed,
    every run of non-letter characters becomes a single space, and words
    shorter than three characters are dropped.

    Parameters
    ----------
    data : object with a ``posts`` attribute
        ``data.posts`` must be an iterable of strings (e.g. a DataFrame column).

    Returns
    -------
    list of str
        One cleaned string per post, in the original order.
    """
    # Applied in order; each entry is (pattern, replacement)
    substitutions = (
        (r'http[s]?://\S+', ''),            # links
        (r'@([a-zA-Z0-9_]{1,50})', ''),     # @mentions
        (r'#([a-zA-Z0-9_]{1,50})', ''),     # #hashtags
        (r'[^A-Za-z]+', ' '),               # anything that is not a letter
    )

    def _scrub(raw):
        text = raw.lower()
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        # Keep only words of three or more characters
        return " ".join(token for token in text.split() if len(token) >= 3)

    return [_scrub(post) for post in data.posts]

# Function to remove contractions
def fix_contractions(df, column_name = "posts"):
    """Run ``contractions.fix`` over every value of a text column.

    The column is overwritten on ``df`` itself (the caller's frame is
    modified) and the same frame is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str, default "posts"
        Column whose values are passed through ``contractions.fix``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column_name`` replaced.
    """
    expanded = df[column_name].apply(contractions.fix)
    df[column_name] = expanded
    return df

In [41]:
# # Lemmatization class
# class Lemmatizer(object):
#     def __init__(self):
#         self.lemmatizer = WordNetLemmatizer()
#     def __call__(self, sentence):
#         return [self.lemmatizer.lemmatize(word) for word in sentence.split() if len(word)>2]
# def tokenize_and_lemmatize(text):
#     lemmatizer = WordNetLemmatizer()
#     tokens = text.split()
#     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
#     return lemmatized_tokens

In [16]:
# For every target column: split, rebalance, clean, vectorize, fit eight
# classifiers, rank them by test accuracy and persist the five best together
# with the fitted vectorizer and label encoder.
for name in cols:

    data = df[[name, 'posts']].copy()
    print('\033[35m' +  'Initializing training of '+ name + ' data' + '\033[0m')

    # Stratified 80/20 split keeps the class proportions of this target in both parts
    train_data,test_data=train_test_split(data,test_size=0.2,random_state=42,stratify=data[name])
    print('\033[1m' + 'Resampling imbalanced data' + '\033[0m')
    # Only the training part is rebalanced; the test part keeps its own distribution
    train_data = undersample_majority(train_data, name)
    # Expand contractions first: clear_text later turns apostrophes into spaces
    train_data = fix_contractions(train_data)
    test_data = fix_contractions(test_data)
    print(f'Cleaning {name} training data...')
    train_data.posts =clear_text(train_data)
    print(f'Cleaning {name} testing data...')
    test_data.posts =clear_text(test_data)
    print('Preparing classifiers...')
    # The TF-IDF vocabulary is fitted on training text only, then applied to both parts
    vectorizer=TfidfVectorizer(max_features=5000,stop_words='english',tokenizer=tokenize_and_lemmatize)
    vectorizer.fit(train_data.posts)
    train_post=vectorizer.transform(train_data.posts).toarray()
    test_post=vectorizer.transform(test_data.posts).toarray()
    # Encode the labels; the encoder is fitted on the training labels
    target_encoder=LabelEncoder()
    train_target=target_encoder.fit_transform(train_data[name])
    test_target=target_encoder.transform(test_data[name])

    # (result key, progress label, fresh estimator) - fitted in this order.
    # Key and label are kept separate because they differ in case for kNN.
    candidates = [
        ('logistic regression', 'logistic regression', LogisticRegression()),
        ('Linear Support Vector classifier', 'Linear Support Vector classifier', LinearSVC()),
        ('Support Vector classifier', 'Support Vector classifier', SVC()),
        ('Multinomial Naive Bayes', 'Multinomial Naive Bayes', MultinomialNB()),
        ('Decision Tree classifier', 'Decision Tree classifier', DecisionTreeClassifier()),
        ('Random Forest classifier', 'Random Forest classifier', RandomForestClassifier()),
        ('XGBoost classifier', 'XGBoost classifier', XGBClassifier(gpu_id=-1)),
        ('kNN Classifier', 'kNN classifier', KNeighborsClassifier()),
    ]
    # Classifier name -> accuracy on the test part
    models_accuracy={}
    # Fitted estimators, in the same order as models_accuracy
    models = []
    for key, label, estimator in candidates:
        print(f'Running {label}...')
        estimator.fit(train_post,train_target)
        models_accuracy[key]=accuracy_score(test_target,estimator.predict(test_post))
        models.append(estimator)

    print('\033[93m' + 'Analyzing..' + '\033[0m')
    # One row per classifier, best accuracy first, index reset to 0..n-1
    accuracy=pd.DataFrame(models_accuracy.items(),columns=['Classifier','Accuracy'])
    accuracy['Model'] = models
    accuracy = accuracy.sort_values(by='Accuracy',ascending=False,ignore_index=True)
    # NOTE(review): the ranking uses accuracy on the test part, so the mean
    # reported below is an optimistic estimate for the selected models.
    # Save the top 5 models; `with` closes each file handle once it is written
    for i in range(5):
        with open('../media/'+name+'_model'+str(i)+'.sav', 'wb') as model_file:
            pickle.dump(accuracy.Model[i], model_file)
        print(f'Saving {accuracy.Classifier[i]}')
    # Save vectorizer and encoder
    with open('../media/'+name+'_vectors.pickle', 'wb') as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)
    with open('../media/'+name+'_encoder.obj', 'wb') as encoder_file:
        pickle.dump(target_encoder, encoder_file)
    # Names and accuracies of the five best models
    top = list(accuracy.Classifier.head())
    acc = list(accuracy.Accuracy.head())
    avg = sum(acc)/len(acc)
    print(f'Best models to predict {name} are {top} with a mean accuracy of {avg:.2%}')
# `start` is set in the imports cell, so this is the time elapsed since that cell ran
tt = (((time.time()-start)/60))
print('\033[102m' +  f'Training completed successfully in {tt:.2f} Minutes' + '\033[0m')


[35mInitializing training of extraversion data[0m
[1mResampling imbalanced data[0m
Cleaning extraversion training data...
Cleaning extraversion testing data...
Preparing classifiers...
Running logistic regression...
Running Linear Support Vector classifier...
Running Support Vector classifier...
Running Multinomial Naive Bayes...
Running Decision Tree classifier...
Running Random Forest classifier...
Running XGBoost classifier...
Running kNN classifier...
[93mAnalyzing..[0m
Saving Support Vector classifier
Saving logistic regression
Saving Random Forest classifier
Saving Multinomial Naive Bayes
Saving Linear Support Vector classifier
Best models to predict extraversion are ['Support Vector classifier', 'logistic regression', 'Random Forest classifier', 'Multinomial Naive Bayes', 'Linear Support Vector classifier'] with a mean accuracy of 55.83%
[35mInitializing training of neuroticism data[0m
[1mResampling imbalanced data[0m
Cleaning neuroticism training data...
Cleaning neur

In [17]:
# Load the MBTI dataset. This rebinds `df`, replacing the essays frame loaded earlier.
df = pd.read_csv('../media/mbti_1.csv')

#Function to split types in MBTI dataset
def divide_mbti_types(df):
    """Split the ``type`` column into one column per character position.

    Characters 0-3 of the string form of ``type`` are written to the new
    columns ``EI``, ``NS``, ``FT`` and ``JP`` respectively. The columns are
    added to ``df`` itself, and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``type`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the four extra columns.
    """
    type_codes = df['type'].astype(str)
    for position, column in enumerate(("EI", "NS", "FT", "JP")):
        df[column] = type_codes.str[position]
    return df

# Add the four single-letter columns (EI, NS, FT, JP) derived from 'type'
df = divide_mbti_types(df)

# Target columns: the training loop fits one set of models per column
cols = ['EI', 'NS', 'FT', 'JP']

In [18]:
# For every target column: split, rebalance, clean, vectorize, fit eight
# classifiers, rank them by test accuracy and persist the five best together
# with the fitted vectorizer and label encoder.
for name in cols:

    data = df[[name, 'posts']].copy()
    print('\033[35m' + 'Initializing training of ' + name + ' data' + '\033[0m')

    # Stratified 80/20 split keeps the class proportions of this target in both parts
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data[name])
    print('\033[1m' + 'Resampling imbalanced data' + '\033[0m')
    # Only the training part is rebalanced; the test part keeps its own distribution
    train_data = undersample_majority(train_data, name)
    # Expand contractions first: clear_text later turns apostrophes into spaces
    train_data = fix_contractions(train_data)
    test_data = fix_contractions(test_data)
    print(f'Cleaning {name} training data...')
    train_data.posts = clear_text(train_data)
    print(f'Cleaning {name} testing data...')
    test_data.posts = clear_text(test_data)
    print('Preparing classifiers...')
    # The TF-IDF vocabulary is fitted on training text only, then applied to both parts
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', tokenizer=tokenize_and_lemmatize)
    vectorizer.fit(train_data.posts)
    train_post = vectorizer.transform(train_data.posts).toarray()
    test_post = vectorizer.transform(test_data.posts).toarray()
    # Encode the labels; the encoder is fitted on the training labels
    target_encoder = LabelEncoder()
    train_target = target_encoder.fit_transform(train_data[name])
    test_target = target_encoder.transform(test_data[name])

    # (result key, progress label, fresh estimator) - fitted in this order.
    # Key and label are kept separate because they differ in case for kNN.
    candidates = [
        ('logistic regression', 'logistic regression', LogisticRegression()),
        ('Linear Support Vector classifier', 'Linear Support Vector classifier', LinearSVC()),
        ('Support Vector classifier', 'Support Vector classifier', SVC()),
        ('Multinomial Naive Bayes', 'Multinomial Naive Bayes', MultinomialNB()),
        ('Decision Tree classifier', 'Decision Tree classifier', DecisionTreeClassifier()),
        ('Random Forest classifier', 'Random Forest classifier', RandomForestClassifier()),
        ('XGBoost classifier', 'XGBoost classifier', XGBClassifier(gpu_id=-1)),
        ('kNN Classifier', 'kNN classifier', KNeighborsClassifier()),
    ]
    # Classifier name -> accuracy on the test part
    models_accuracy = {}
    # Fitted estimators, in the same order as models_accuracy
    models = []
    for key, label, estimator in candidates:
        print(f'Running {label}...')
        estimator.fit(train_post, train_target)
        models_accuracy[key] = accuracy_score(test_target, estimator.predict(test_post))
        models.append(estimator)

    print('\033[93m' + 'Analyzing..' + '\033[0m')
    # One row per classifier, best accuracy first, index reset to 0..n-1
    accuracy = pd.DataFrame(models_accuracy.items(), columns=['Classifier', 'Accuracy'])
    accuracy['Model'] = models
    accuracy = accuracy.sort_values(by='Accuracy', ascending=False, ignore_index=True)
    # NOTE(review): the ranking uses accuracy on the test part, so the mean
    # reported below is an optimistic estimate for the selected models.
    # Save the top 5 models; `with` closes each file handle once it is written
    for i in range(5):
        with open('../media/'+name+'_model'+str(i)+'.sav', 'wb') as model_file:
            pickle.dump(accuracy.Model[i], model_file)
        print(f'Saving {accuracy.Classifier[i]}')
    # Save vectorizer and encoder
    with open('../media/'+name+'_vectors.pickle', 'wb') as vectorizer_file:
        pickle.dump(vectorizer, vectorizer_file)
    with open('../media/'+name+'_encoder.obj', 'wb') as encoder_file:
        pickle.dump(target_encoder, encoder_file)
    # Names and accuracies of the five best models
    top = list(accuracy.Classifier.head())
    acc = list(accuracy.Accuracy.head())
    avg = sum(acc)/len(acc)
    print(f'Best models to predict {name} are {top} with a mean accuracy of {avg:.2%}')
# `start` is set in the imports cell, so this is the time elapsed since that
# cell ran (it is not reset before this loop)
tt = (((time.time()-start)/60))
print('\033[102m' +  f'Training completed successfully in {tt:.2f} Minutes' + '\033[0m')


[35mInitializing training of EI data[0m
[1mResampling imbalanced data[0m
Counter({'E': 1599, 'I': 1599})
Cleaning EI training data...
Cleaning EI testing data...
Preparing classifiers...
Running logistic regression...
Running Linear Support Vector classifier...
Running Support Vector classifier...
Running Multinomial Naive Bayes...
Running Decision Tree classifier...
Running Random Forest classifier...
Running XGBoost classifier...
Running kNN classifier...
[93mAnalyzing..[0m
Saving logistic regression
Saving Support Vector classifier
Saving XGBoost classifier
Saving Linear Support Vector classifier
Saving Multinomial Naive Bayes
Best models to predict EI are ['logistic regression', 'Support Vector classifier', 'XGBoost classifier', 'Linear Support Vector classifier', 'Multinomial Naive Bayes'] with a mean accuracy of 80.31%
[35mInitializing training of NS data[0m
[1mResampling imbalanced data[0m
Counter({'N': 958, 'S': 958})
Cleaning NS training data...
Cleaning NS testing d