In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to show several results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

Load the data and do some quick analysis.

In [None]:
import pandas as pd
# Read the dataset directly from the zip archive (pandas infers the compression
# from the file extension). Later cells use its 'type' and 'posts' columns.
all_data = pd.read_csv('data_archive.zip')
# Show the loaded frame for a quick visual check.
all_data

In [None]:
# Absolute (non-normalised) number of rows per personality type, most frequent first.
all_data['type'].value_counts()

The dataset is heavily imbalanced, so I first attempt classification for only the two most frequent classes: INTJ and INTP.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build a small two-class sample: at most `num_records_of_each_type` rows of
# INTP followed by at most the same number of INTJ rows.
# To use every INTP/INTJ record instead of a capped sample:
#     data = all_data[all_data['type'].isin(['INTP', 'INTJ'])]
num_records_of_each_type = 100
# Fixed seed so the split, and every metric derived from it, is reproducible
# across "Restart & Run All".
random_seed = 42

intp_data = all_data[all_data['type'] == 'INTP'].head(num_records_of_each_type)
intj_data = all_data[all_data['type'] == 'INTJ'].head(num_records_of_each_type)
data = pd.concat([intp_data, intj_data])
data
data['type'].value_counts()

# Hold out 10% of the sample for testing. Stratifying on the label keeps the
# class ratio identical in both parts, which matters for such a small test set.
posts_train, posts_test, y_train, y_test = train_test_split(
    data['posts'],
    data['type'],
    test_size=0.1,
    random_state=random_seed,
    stratify=data['type'],
)

# Learn the TF-IDF vocabulary and weights from the training posts only, then
# apply the same transformation to the test posts. The raw text is kept under
# separate names so that re-reading this cell shows what each variable holds.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(posts_train)
X_test = vectorizer.transform(posts_test)

In [None]:
import os
import neptune.new as neptune
import mlflow
import sys, json
from joblib import dump, load

# The Neptune API token is a credential and must not live in the notebook.
# It is read from the NEPTUNE_API_TOKEN environment variable instead.
# NOTE(review): a token was previously committed in this notebook; it should be
# revoked and replaced.
neptune_api_token = os.environ.get("NEPTUNE_API_TOKEN")
if not neptune_api_token:
    raise RuntimeError(
        "NEPTUNE_API_TOKEN is not set; export it before running this cell."
    )

# Project-level handle, used later for metadata shared by all runs.
neptune_project = neptune.init_project(name="tapadipti/mtbi", api_token=neptune_api_token)

# Run-level handle for this single experiment; the notebook file is attached
# to the run as its source.
neptune_run = neptune.init(
    project="tapadipti/mtbi",
    api_token=neptune_api_token,
    source_files=["mtbi_prediction_task.ipynb"]
)

# Point MLflow at the Databricks tracking server and select the experiment.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/tapadipti@gmail.com/mtbi_personality_types")
# Autologging is switched off; parameters, metrics and the model are logged
# explicitly further down.
mlflow.sklearn.autolog(disable=True)
# If an earlier execution of this cell failed before reaching mlflow.end_run(),
# its run is still active and start_run() would raise. Close it first so the
# cell can simply be re-run.
if mlflow.active_run() is not None:
    mlflow.end_run()
mlflow.start_run()

# Hyper-parameters are kept outside the notebook; the "lr" entry of
# params.json is passed to LogisticRegression as keyword arguments.
with open("params.json") as f:
    all_params = json.load(f)
params = all_params["lr"]

from sklearn.metrics import accuracy_score

# Fit the classifier on the TF-IDF training matrix using the loaded parameters.
lr = LogisticRegression(**params)
lr.fit(X_train, y_train)

# Score the held-out posts: per-class report on stdout, overall accuracy kept
# in a variable for the experiment trackers.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

# Persist the metric and the fitted model next to the notebook.
metrics_payload = json.dumps({"accuracy": accuracy})
with open("metrics.json", "w") as metrics_file:
    metrics_file.write(metrics_payload)
dump(lr, "model.joblib")

from urllib.parse import urlparse

# --- Neptune: record the hyper-parameters and the accuracy on the run. ---
neptune_run["parameters"] = params
neptune_run["accuracy"] = accuracy

# Additionally append the accuracy to a series field.
neptune_run["accuracy_log"].log(accuracy)

# Display the two fields' handlers in the cell output.
neptune_run["parameters"]
neptune_run["accuracy"]


# --- MLflow: record the same information on the active run. ---
# Log every hyper-parameter that was passed to the model (not just "penalty"),
# so MLflow and Neptune hold the same set and a params.json without a
# "penalty" entry does not raise KeyError here.
mlflow.log_params(params)
mlflow.log_metric("accuracy", accuracy)
# NOTE(review): not used by any visible cell; candidate for removal.
tracking_uri_scheme = urlparse(mlflow.get_tracking_uri()).scheme
mlflow.sklearn.log_model(lr, "model")


from sklearn.metrics import confusion_matrix

# Pin the label set and order to the classes the classifier was trained on.
# Without `labels`, scikit-learn derives them from whatever appears in
# y_test / y_pred, so a split that happens to miss a class would silently
# produce a smaller matrix.
cm = confusion_matrix(y_test, y_pred, labels=lr.classes_)
cm

# Project-level metadata: the notebook source and the dataset version.
neptune_project["general/source_code"].upload("mtbi_prediction_task.ipynb")
neptune_project["dataset/v0.1"].track_files("./data_archive.zip")

# Run-level metadata: the same notebook and dataset are tracked on this run.
neptune_run["notebook_code"].track_files("mtbi_prediction_task.ipynb")
neptune_run["train_dataset"].track_files("./data_archive.zip")


import matplotlib.pyplot as plt
import numpy as np

# Draw the confusion matrix as an annotated heat map using the explicit
# Axes API throughout (no pyplot state-machine calls).
fig, ax = plt.subplots()
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Types')
ax.set_ylabel('Actual Types')

ax.matshow(cm, cmap="Blues", alpha=0.2)

# Take the tick labels from the fitted classifier instead of hard-coding them.
# `classes_` is sorted, which is also the order confusion_matrix uses for its
# rows and columns.
mtbi_types = list(lr.classes_)
tick_positions = np.arange(len(mtbi_types))
ax.xaxis.set_ticks(tick_positions)
ax.xaxis.set_ticklabels(mtbi_types)
ax.yaxis.set_ticks(tick_positions)
ax.yaxis.set_ticklabels(mtbi_types)

# Write each count into its cell: row index -> y (actual), column -> x (predicted).
for (i, j), count in np.ndenumerate(cm):
    ax.text(x=j, y=i, s=str(count), va='center', ha='center', size='large')

plt.show()

# Attach the figure to the Neptune run, then close every tracking connection
# opened by this cell: the Neptune run, the Neptune project handle (previously
# left open) and the MLflow run.
neptune_run["confusion_matrix"].upload(fig)
neptune_run.stop()
neptune_project.stop()

mlflow.end_run()

In [None]:
# Manual cleanup: ends whichever MLflow run is currently active.
# NOTE(review): presumably kept for the case where the training cell fails
# before reaching its own mlflow.end_run(); confirm it is still needed.
mlflow.end_run()

In [None]:


# neptune_run["precision"] = {"INTJ": 0.92, "INTP": 0.92}
# neptune_run["recall"] = {"INTJ": 0.91, "INTP": 0.93}
# neptune_run["f1-score"] = {"INTJ": 0.91, "INTP": 0.93}


# neptune_project["general/brief"] = "URL_TO_PROJECT_BRIEF"

# project["dataset/latest"] = project["dataset/v0.1"].fetch()
# project = neptune.init_project("tapadipti/mtbi", mode="read-only")
# run["dataset"] = project["dataset/v0.1"].fetch()
# run["dataset"].download()



In [None]:
# NOTE(review): looks like a leftover scratch / sanity-check cell; candidate for removal.
print(1)