# OK Cupid Date-A-Scientist

### A machine learning (ML) / natural language processing (NLP) portfolio project

This is a portfolio project summing up the ML part of Codecademy's "Data Scientist: Natural Language Processing Specialist" career path – covering both supervised (regressors, classifiers) and unsupervised (clustering, dimensionality reduction) algorithms:
namely decision trees, random forests, K-nearest neighbors, K-means clustering, support vector machines, principal component analysis and naive Bayes.

I'll analyse a sample of the data provided by Codecademy / OK Cupid.

### Importing libraries and data exploration (EDA)

I'll start by importing the libraries and modules we will use: for now pandas, matplotlib, and seaborn for data exploration, plus scikit-learn's `LabelEncoder` for encoding the categorical columns later on.

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

We load the data and peek into it.

In [None]:
# Read the dating-profile table from the CSV file in the working directory
# and print its first rows as a quick sanity check.
profiles = pd.read_csv("profiles.csv")
print(profiles.head())

In [None]:
# List the column names available in the loaded frame.
profiles.columns

In [None]:
# Show the distinct values that occur in the `orientation` column.
print(profiles.orientation.unique())

In [None]:
# Horizontal count plot: one bar per distinct `body_type` value.
sns.countplot(y="body_type", data=profiles, palette="deep")
plt.show()

In [None]:
def plot_it(column):
    """Show a horizontal count plot for one column of the global ``profiles``.

    Args:
        column: Name of the ``profiles`` column whose values are counted.
    """
    sns.countplot(data=profiles, y=column, palette="deep")
    plt.show()


# Draw one count plot per column listed below, in this order.
for column in (
    'body_type',
    'diet',
    'drinks',
    'drugs',
    'job',
    'offspring',
    'orientation',
    'pets',
    'religion',
    'sex',
    'sign',
    'smokes',
    'status',
):
    plot_it(column)

In [None]:
# Inspect the dtype of every column; object-dtype columns are the ones
# make_labels() encodes.
profiles.dtypes

In [None]:
def make_labels(dataset):
    """Return a copy of ``dataset`` with its object-dtype columns label-encoded.

    Every object-dtype column is replaced by the integer codes produced by a
    freshly fitted ``LabelEncoder``. Columns of any other dtype are carried
    over unchanged, and the frame passed in is not modified.

    Args:
        dataset: DataFrame to encode.

    Returns:
        A new DataFrame with the same columns as ``dataset``.
    """
    encoded = dataset.copy()

    # Each column gets its own encoder, so codes are independent per column.
    for name in encoded.select_dtypes(include=['object']).columns:
        encoded[name] = LabelEncoder().fit_transform(encoded[name])

    return encoded

## Let's build some models

We'll start with some simple decision tree and random forest classifiers.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

def make_tree(df, target):
    """Fit a decision tree that predicts ``target`` and print its F1 scores.

    The frame is split into train and test parts with a fixed seed, a
    ``DecisionTreeClassifier`` is trained on the training part, and the
    weighted, macro and per-class F1 scores on the test part are printed.

    Args:
        df: Fully numeric DataFrame holding the features and the target.
        target: Name of the column to predict; all other columns are features.
    """
    features = df.drop(columns=target)
    labels = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, random_state=42
    )

    model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
    predicted = model.predict(X_test)

    # average=None yields one score per class instead of a single number.
    for averaging in ("weighted", "macro", None):
        score = f1_score(y_test, predicted, average=averaging)
        print(f"{averaging} F1 score: {score}")

In [None]:
def make_forest(df, target):
    """Fit a class-balanced random forest on ``target`` and print its F1 scores.

    Uses the same seeded train/test split as the other model helpers, trains
    a ``RandomForestClassifier`` with ``class_weight='balanced'``, and prints
    the weighted, macro and per-class F1 scores on the test part.

    Args:
        df: Fully numeric DataFrame holding the features and the target.
        target: Name of the column to predict; all other columns are features.
    """
    y = df[target]
    X = df.drop(columns=target)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    classifier = RandomForestClassifier(class_weight='balanced', random_state=42)
    y_pred = classifier.fit(X_train, y_train).predict(X_test)

    weighted = f1_score(y_test, y_pred, average='weighted')
    macro = f1_score(y_test, y_pred, average='macro')
    per_class = f1_score(y_test, y_pred, average=None)

    print(f"weighted F1 score: {weighted}")
    print(f"macro F1 score: {macro}")
    print(f"None F1 score: {per_class}")

In [None]:
def make_gradient(df, target):
    """Fit an XGBoost classifier on ``target`` and print its F1 scores.

    The target column is re-encoded to consecutive integers before training,
    the frame is split with a fixed seed, and the weighted, macro and
    per-class F1 scores on the test part are printed (in that order).

    Args:
        df: Fully numeric DataFrame holding the features and the target.
        target: Name of the column to predict; all other columns are features.
    """
    X = df.drop(target, axis=1)

    # Recent XGBoost releases reject class labels that are not exactly
    # 0..k-1. Codes produced upstream can have gaps once rows are dropped,
    # so re-encode here. The mapping preserves the sort order of the labels,
    # which keeps the per-class F1 output in the same order as before.
    y = LabelEncoder().fit_transform(df[target])

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

    # NOTE(review): scale_pos_weight is documented for binary problems;
    # it presumably has no effect on a multi-class target - confirm.
    xgb = XGBClassifier(
        scale_pos_weight=6,
        learning_rate=0.01,
        n_estimators=200,
        max_depth=100
    )

    xgb.fit(X_train, y_train)

    predictions = xgb.predict(X_test)

    print(f1_score(y_test, predictions, average = "weighted"))
    print(f1_score(y_test, predictions, average = "macro"))
    print(f1_score(y_test, predictions, average = None))

In [None]:
# The ten essay columns are removed before building the modelling frame.
essays = [f"essay{i}" for i in range(10)]
df_wo_essays = profiles.drop(columns=essays).copy()

# NOTE(review): encoding runs before dropna(), so missing values in object
# columns may already have been turned into label codes (or may raise,
# depending on the scikit-learn version) by the time dropna() is applied -
# confirm this order is intended.
df = make_labels(df_wo_essays).dropna()

print(df.columns)
print(df.isna().sum())

In [None]:
# Decision tree predicting the encoded `sex` column from all other columns.
make_tree(df, "sex")

In [None]:
# Random forest on the same target, for comparison with the single tree.
make_forest(df, "sex")

In [None]:
# Gradient-boosted classifier predicting the encoded `body_type` column.
make_gradient(df, "body_type")

Let's try some regressors.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def make_linear(df, target):
    """Fit a linear regression on ``target`` and print MSE, MAE and R².

    The frame is split into train and test parts with a fixed seed, a
    ``LinearRegression`` is trained on the training part, and the three
    metrics on the test part are printed with four decimal places.

    Args:
        df: Fully numeric DataFrame holding the features and the target.
        target: Name of the column to predict; all other columns are features.
    """
    y = df[target]
    X = df.drop(columns=target)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    report = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R²", r2_score),
    )
    for label, metric in report:
        print(f"{label}: {metric(y_test, y_pred):.4f}")

In [None]:
# Peek at the first ten raw ages.
print(profiles.age.head(10))

# Number of rows in the encoded frame per age, ordered by age.
age_counts = df['age'].value_counts().sort_index()

# Same figure as before, drawn through an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(x=age_counts.index, y=age_counts.values, ax=ax)
ax.set_title('Age Frequency Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
ax.grid(True)
plt.show()

In [None]:
# Linear regression predicting `height` from all other columns.
make_linear(df, "height")

Let's try some clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def scale_it(df):
    """Return a standardised copy of ``df`` with its labels preserved.

    Every column is transformed with a freshly fitted ``StandardScaler``.
    The result keeps both the column names and the row index of ``df``, so
    its rows still line up with the source rows when ``df`` has a
    non-default index (for example after rows were filtered out).

    Args:
        df: Fully numeric DataFrame to scale.

    Returns:
        A new DataFrame of the same shape, columns and index as ``df``.
    """
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df)

    # fit_transform returns a bare array; re-attach both axes' labels.
    scaled_df = pd.DataFrame(scaled_data, columns=df.columns, index=df.index)

    return scaled_df

In [None]:
# Standardised version of the encoded frame, used by the clustering step.
df_scaled = scale_it(df)

In [None]:
def cluster_it(df):
    cluster = KMeans(n_clusters=3)

    cluster.fit(df_scaled)

    labels = model.predict(data_samples)