# Loading the Data

In [None]:
import re

import pandas as pd

# "," is already read_csv's default separator, so it is not spelled out;
# this keeps the call in the same form as the other load cells.
df_birth_year = pd.read_csv("../data/birth_year.csv")

df_birth_year

In [None]:
# Read extrovert_introvert.csv from ../data into a DataFrame.
df_extrovert_introvert = pd.read_csv("../data/extrovert_introvert.csv")

# Bare last expression: the notebook renders the frame as this cell's output.
df_extrovert_introvert

In [None]:
# Read feeling_thinking.csv from ../data into a DataFrame.
df_feeling_thinking = pd.read_csv("../data/feeling_thinking.csv")

# Bare last expression: the notebook renders the frame as this cell's output.
df_feeling_thinking

In [None]:
# Read gender.csv from ../data into a DataFrame. Later cells scrub and
# gender-swap the 'post' column of this frame.
df_gender = pd.read_csv("../data/gender.csv")

# Bare last expression: the notebook renders the frame as this cell's output.
df_gender

In [None]:
# Read judging_perceiving.csv from ../data into a DataFrame.
df_judging_perceiving = pd.read_csv("../data/judging_perceiving.csv")

# Bare last expression: the notebook renders the frame as this cell's output.
df_judging_perceiving

In [None]:
# Read nationality.csv from ../data into a DataFrame.
df_nationality = pd.read_csv("../data/nationality.csv")

# Bare last expression: the notebook renders the frame as this cell's output.
df_nationality

In [None]:
# Read political_leaning.csv from ../data into a DataFrame.
df_political_leaning = pd.read_csv("../data/political_leaning.csv")

# Bare last expression: the notebook renders the frame as this cell's output.
df_political_leaning

In [None]:
# Read sensing_intuitive.csv from ../data into a DataFrame.
df_sensing_intuitive = pd.read_csv("../data/sensing_intuitive.csv")

# Bare last expression: the notebook renders the frame as this cell's output.
df_sensing_intuitive

# Scrubber on Gender Dataset

In [None]:
# Install the scrubadub spaCy plugin; -q keeps pip's output quiet.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
%pip install scrubadub-spacy -q

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import scrubadub_spacy, scrubadub
import spacy

# Load the spaCy model
# NOTE(review): scrubber() below passes the model *name* to the detector rather
# than this object, so `nlp` is not used in the visible cells — confirm whether
# it is needed elsewhere or only serves as an early "is the model installed" check.
nlp = spacy.load("en_core_web_sm")

def scrubber(df):
    """Scrub every entry of ``df['post']`` with scrubadub plus the spaCy entity detector.

    Args:
        df: DataFrame with a 'post' column of text.

    Returns:
        The same DataFrame object; its 'post' column is replaced by the
        scrubbed text (the frame is modified in place, as before).
    """
    # Named `cleaner` so the local no longer shadows this function's own name.
    cleaner = scrubadub.Scrubber()

    # Add the SpacyEntityDetector with the loaded model
    cleaner.add_detector(scrubadub_spacy.detectors.SpacyEntityDetector(model="en_core_web_sm"))

    def clean_post(text):
        # Non-string cells (e.g. NaN from a blank CSV field) are passed through
        # unchanged instead of being handed to the scrubber.
        return cleaner.clean(text) if isinstance(text, str) else text

    # Map over the column positionally. Writing back with df.at[index, ...]
    # targets every row that shares an index label, so duplicated labels would
    # overwrite each other's text.
    df['post'] = df['post'].map(clean_post)

    return df

# Scrub the 'post' column. scrubber() returns the frame it was given, so
# df_gender still refers to the same (now scrubbed) DataFrame.
df_gender = scrubber(df_gender)

# Bare last expression: the notebook renders the scrubbed frame as this cell's output.
df_gender

# Gender Swapping on Scrubbed Dataset

In [None]:
def gender_swap(df):
    """Swap gendered words in ``df['post']`` for their counterparts.

    Words are matched as whole alphabetic tokens, so punctuation, newlines and
    tabs next to a word ("girl," / "he's") no longer prevent a swap. Matching is
    case-insensitive and the replacement copies the casing of the word it
    replaces ("She" -> "He", "MR" -> "MS"). All other characters, including
    whitespace, are left exactly as they were; no trailing space is appended.

    Args:
        df: DataFrame with a 'post' column of text.

    Returns:
        The same DataFrame object; its 'post' column is replaced by the
        swapped text (the frame is modified in place, as before).
    """
    # Mapping of gendered words, keyed in lower case.
    # The user can add their own words too.
    # Note: "her" always maps to "his" and there is no entry for "him".
    swaps = {
        "batman": "batwoman", "batwoman": "batman",
        "boy": "girl", "girl": "boy",
        "boyfriend": "girlfriend", "girlfriend": "boyfriend",
        "father": "mother", "mother": "father",
        "husband": "wife", "wife": "husband",
        "he": "she", "she": "he",
        "his": "her", "her": "his",
        "male": "female", "female": "male",
        "man": "woman", "woman": "man",
        "mr": "ms", "ms": "mr",
        "sir": "madam", "madam": "sir",
        "son": "daughter", "daughter": "son",
        "uncle": "aunt", "aunt": "uncle",
    }

    # Whole alphabetic words only: "boy" inside "boyfriend" or "boy2" is not a match.
    word_pattern = re.compile(r"\b[A-Za-z]+\b")

    def match_case(template, word):
        # Copy the casing style of the word being replaced.
        if template.isupper():
            return word.upper()
        if template[0].isupper():
            return word.capitalize()
        return word

    def swap_word(match):
        word = match.group(0)
        counterpart = swaps.get(word.lower())
        return word if counterpart is None else match_case(word, counterpart)

    def change_gender(string):
        # Non-string cells (e.g. NaN) are returned untouched.
        if not isinstance(string, str):
            return string
        return word_pattern.sub(swap_word, string)

    # Map over the column positionally; df.at[index, ...] would write to every
    # row sharing an index label when labels are duplicated.
    df['post'] = df['post'].map(change_gender)

    return df

# Apply the word-level gender swap to the 'post' column. gender_swap() returns
# the frame it was given, so df_gender still refers to the same DataFrame.
df_gender = gender_swap(df_gender)

# Bare last expression: the notebook renders the swapped frame as this cell's output.
df_gender

In [6]:
# Save next to the input CSVs in ../data: the classifier cell reads the
# augmented file back from "../data/df_gender_augmented.csv", so writing to
# "./data/..." left that read pointing at a file this cell never produced.
df_gender.to_csv("../data/df_gender_augmented.csv", index=False)

# Probabilistic Classifier on Augmented Data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from scipy.spatial import distance

df_gender_augmented = pd.read_csv("../data/df_gender_augmented.csv")

# A blank post is read back from CSV as NaN, and TfidfVectorizer raises on NaN
# documents; treat such posts as empty text instead.
posts = df_gender_augmented['post'].fillna("")

# Vectorize the text data (vocabulary capped at the 5000 top terms)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(posts)
y = df_gender_augmented['female']

# Split the data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def train_svm(X_train, y_train, X_test, y_test):
    """Fit a linear-kernel SVC, print its accuracy on the test split, and return it."""
    classifier = SVC(kernel='linear')
    classifier.fit(X_train, y_train)

    # Score on the held-out split and report it
    test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
    print(f"SVM Accuracy: {test_accuracy}")

    return classifier

def train_naive_bayes(X_train, y_train, X_test, y_test):
    """Fit a MultinomialNB, print its accuracy on the test split, and return it."""
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    # Score on the held-out split and report it
    test_accuracy = accuracy_score(y_test, classifier.predict(X_test))
    print(f"Naive Bayes Accuracy: {test_accuracy}")

    return classifier


def train_knn(X_train, y_train, X_test, y_test, k=5):
    """Classify each test row by majority vote of its k nearest training rows.

    Distances are Euclidean. There is no fitted model object: the function
    prints the accuracy on the test split and returns the predictions.

    Args:
        X_train, X_test: feature matrices, either scipy sparse (densified here)
            or anything ``np.asarray`` accepts.
        y_train: training labels (pandas Series, list or array).
        y_test: true labels for ``X_test``, used only for the accuracy print.
        k: number of neighbours that vote; must be at least 1.

    Returns:
        numpy array with one predicted label per row of ``X_test``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    def to_dense(matrix):
        # Sparse matrices expose .toarray(); dense inputs are accepted as they are.
        return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)

    # Store the complete training matrix in memory
    X_train = to_dense(X_train)
    X_test = to_dense(X_test)

    # Positional label array. Indexing a pandas Series with an integer array is
    # a *label* lookup, and after a shuffled split the index labels no longer
    # match row positions (KeyError or the wrong rows' labels).
    train_labels = np.asarray(y_train)

    def knn_predict(x):
        # Distance from x to every training row
        dists = distance.cdist([x], X_train, 'euclidean')[0]

        # The k training rows with the smallest distance to x
        nearest_indices = np.argsort(dists)[:k]

        # Majority label among them; on a tie the smallest label wins, which is
        # what bincount().argmax() did, but this also works for non-integer labels.
        labels, counts = np.unique(train_labels[nearest_indices], return_counts=True)
        return labels[counts.argmax()]

    # Evaluate the model
    y_pred = np.array([knn_predict(x) for x in X_test])
    print(f"KNN Accuracy: {accuracy_score(y_test, y_pred)}")

    return y_pred

# Train models
# Fit and score each classifier on the same train/test split.
svm_model = train_svm(X_train, y_train, X_test, y_test)
nb_model = train_naive_bayes(X_train, y_train, X_test, y_test)
# Note: train_knn returns the test-set predictions, not a model object, so
# knn_model holds an array of predicted labels.
knn_model = train_knn(X_train, y_train, X_test, y_test)

# Test the SVM model with a custom input
test_input = ["hey girl, you got this!"]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# sentence is encoded with the training vocabulary.
test_input_vectorized = vectorizer.transform(test_input)
svm_prediction = svm_model.predict(test_input_vectorized)

# Print the predicted value of the 'female' label for the sentence
print(f"SVM Prediction for 'hey girl, you got this!': {svm_prediction[0]}")


