# Natural Language Processing Research

There are many fields on dating app profiles which the user writes themselves, therefore creating a language processing problem. My idea is to classify preferences using a Bag-of-words model.

In [34]:
import pandas as pd
import json
import re

f = open('data/bumble.json')
profiles = json.load(f)

# Function to transform the array of objects into a DataFrame
def transform_data_to_dataframe(data):
    # Prepare an empty list to collect rows
    rows = []
    
    # Iterate through each object in the data
    for obj in data:
        # Extract the required information
        row = {
            'liked': obj['liked'],
            'spotify': ' '.join(obj['spotifyArtists'])
        }
        
        # Flatten the attributes into individual columns
        for attr in obj['storyContent']:
            if 'About' in attr['heading']:
                # this heading is about <person_name> needs to be handled separately
                row['About']= attr['content']
            else:
                row[attr['heading']]= attr['content']
            
        # Append the row to the list of rows
        rows.append(row)
    
    # Create a DataFrame from the rows
    return pd.DataFrame(rows)

df = transform_data_to_dataframe(profiles)
df.head()
df.to_csv('data/test.csv')

Now lets combine the text from all the columns

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
import re

def preprocess_text(text):
    # combine text if it's not NaN, then lowercase and remove punctuation
    text = ' '.join([str(item) for item in text if pd.notnull(item)]).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

features = df.drop(columns=['liked'])

preprocessed_texts = features.apply(preprocess_text, axis=1)

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(preprocessed_texts)

feature_names = vectorizer.get_feature_names_out()

(X, feature_names[:10])

(<200x1224 sparse matrix of type '<class 'numpy.int64'>'
 	with 2453 stored elements in Compressed Sparse Row format>,
 array(['13', '1970s', '1975', '1st', '3000', '4th', '5year', '70s',
        '70s80s90s00s', '73'], dtype=object))

Now lets actually train a model for this

In [37]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

y = df['liked']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = SVC()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.90      1.00      0.95        18
        True       0.00      0.00      0.00         2

    accuracy                           0.90        20
   macro avg       0.45      0.50      0.47        20
weighted avg       0.81      0.90      0.85        20



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
