In [None]:
import re
from datetime import datetime

def months_difference(date1: str, date2: str) -> int:
    """Return the number of calendar months from ``date1`` to ``date2``.

    Both arguments are date strings written as ``YYYY-MM-DD`` or
    ``YYYY/MM/DD``; dashes are normalised to slashes before parsing.
    Only the year and month fields contribute to the result: the day of the
    month is parsed but otherwise ignored. The result is negative when
    ``date2`` lies in an earlier month than ``date1``.

    Raises:
        ValueError: if either string does not match ``%Y/%m/%d``.
    """
    # Normalise both separators first, then parse.
    normalised = [re.sub(r"-", "/", raw) for raw in (date1, date2)]
    start, end = (datetime.strptime(text, "%Y/%m/%d") for text in normalised)

    # Place both dates on one linear "months since year 0" scale and subtract.
    return (end.year * 12 + end.month) - (start.year * 12 + start.month)

In [None]:
import csv

# File paths
input_csv = '/content/trajectory_love.csv'  # Input CSV file path
output_csv = 'output_AO3.csv'  # Output CSV file path

# Data grouped per work. The grouping key is the 'id' column, even though the
# variables below are named "title".
title_data = {}

# Read the input CSV. It is expected to hold one row per (work, keyword,
# romanticCategory) combination, so the same work id can appear many times.
with open(input_csv, 'r', newline='', encoding='utf-8') as infile:
    reader = csv.DictReader(infile)

    for row in reader:
        title = row['id']

        if title not in title_data:
            # Work-level fields are taken from the first row seen for this id.
            title_data[title] = {
                'id': row['id'],
                'kudos': row['kudos'],
                'title': row['title'],
                'romanticCategory': set(),  # unique romantic categories of this work
                'rating': row['rating'],
                'contentWarning': row['contentWarning'],
                'words': row['words'],
                'packaged': row['packaged'],
                'published': row['published'],
                'keywords': set()  # unique keywords of this work
            }

        # set.add() is already a no-op for values that are present, so no
        # membership check is needed.
        entry = title_data[title]
        entry['keywords'].add(row['keyword'])
        entry['romanticCategory'].add(row['romanticCategory'])

# Write the result to a new CSV: one row per work.
with open(output_csv, 'w', newline='', encoding='utf-8') as outfile:
    # Define the fieldnames
    fieldnames = ['id', 'kudos', 'title', 'keywords', 'amount_keywords', 'romanticCategory', 'amount_romanticCategory', 'rating', 'contentWarning', 'words', 'packaged', 'published', "up_time"]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames)

    writer.writeheader()

    for title, data in title_data.items():
        # Sets have no stable iteration order (string hashing is randomised
        # per interpreter run), so sort before joining to make the output file
        # identical from one run to the next.
        keywords = sorted(data['keywords'])
        categories = sorted(data['romanticCategory'])

        writer.writerow({
            'id': data['id'],
            'kudos': data['kudos'],
            'title': data['title'],
            'keywords': ', '.join(keywords),
            'amount_keywords': len(keywords),
            'romanticCategory': ', '.join(categories),
            'amount_romanticCategory': len(categories),
            'rating': data['rating'],
            'contentWarning': data['contentWarning'],
            'words': data['words'],
            'packaged': data['packaged'],
            'published': data['published'],
            # [:-9] drops the last 9 characters of 'packaged' before parsing it
            # as a date -- presumably a " HH:MM:SS" time suffix; TODO confirm
            # against the input file.
            'up_time': months_difference(data['published'], data['packaged'][:-9])
        })

In [None]:
!pip install gensim



In [None]:
import pandas as pd
from gensim.models import Word2Vec

def create_word2vec(df, columns, vector_size=100, window=5, min_count=1):
    """Replace text columns of ``df`` with averaged Word2Vec embeddings.

    For each name in ``columns`` a separate Word2Vec model is trained on that
    column only. Every cell is converted with ``str()`` and split on
    whitespace, and is then replaced by the mean of its token vectors.

    Args:
        df: DataFrame holding the text columns. It is modified in place.
        columns: names of the columns to convert.
        vector_size: dimensionality of the embeddings.
        window: Word2Vec context window size.
        min_count: tokens occurring fewer times than this are dropped from
            the vocabulary by Word2Vec.

    Returns:
        The same ``df`` object, with each listed column holding one
        ``vector_size``-long NumPy array per row.

    Notes:
        * Missing values are not special-cased: ``str(NaN)`` yields the
          token ``"nan"``.
        * Rows with no usable token (empty text, or only tokens pruned by
          ``min_count``) get an all-zero vector.
    """
    # Local import: the cell defining this function only imports pandas/gensim.
    import numpy as np

    for column in columns:
        # Tokenize the column (each cell is treated as a string)
        tokenized_column = df[column].apply(lambda x: str(x).split())

        # Train a Word2Vec model for this column alone
        model = Word2Vec(sentences=tokenized_column, vector_size=vector_size, window=window, min_count=min_count)
        vectors = model.wv

        def embed(tokens):
            # Tokens pruned by min_count are absent from the vocabulary and
            # would raise KeyError on lookup, so keep only known ones.
            known = [token for token in tokens if token in vectors.key_to_index]
            if not known:
                # Same type as the non-empty case (ndarray), so every row
                # serialises to text in the same format.
                return np.zeros(vector_size, dtype=np.float32)
            return vectors[known].mean(axis=0)

        # Replace the original column with the averaged vector representation
        df[column] = tokenized_column.apply(embed)

    return df

# Load the per-work table produced by the aggregation cell
df = pd.read_csv('/content/output_AO3.csv')
columns_to_transform = ['keywords', 'rating', 'contentWarning', 'romanticCategory']  # Update with your column names

# Create Word2Vec representations. create_word2vec overwrites the listed
# columns of `df` in place and returns that same frame.
df_transformed = create_word2vec(df, columns_to_transform)

# Persist the returned frame explicitly instead of relying on the in-place
# mutation of `df` as a hidden side effect.
df_transformed.to_csv('word2vec_representation_overwritten.csv', index=False)

print(df_transformed.head())

         id  kudos                                      title  \
0  27042589     11                                    Bridges   
1  13681566    177                       Entirely by Accident   
2  13791840     18                     Nobody's Second Choice   
3  14301855    294  An Unusual Request and One Hell of a Deal   
4  15128201    180                                  won't he?   

                                            keywords  amount_keywords  \
0  [-0.01621489, 0.019760534, 0.01753785, 0.00889...                7   
1  [-0.03754358, 0.04080964, 0.039701052, 0.01872...                5   
2  [-0.062019784, 0.062396917, 0.05589096, 0.0271...                2   
3  [-0.025074823, 0.024605813, 0.023887193, 0.011...               19   
4  [-0.019761443, 0.020010946, 0.020477148, 0.008...               14   

                                    romanticCategory  amount_romanticCategory  \
0  [-0.004352052, 0.0034430432, -0.00083210086, 0...                        2   
1  [-0.0

In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/content/output_AO3.csv')

# Keywords to search for. Matching is case-sensitive, which is why several
# spellings of the same tag are listed.
keywords_list = ["Enemies to Lovers", "e2l", "enemies to lovers", "E2L"]

# Initialize a dictionary to store the counts for each keyword
keyword_counts = {keyword: 0 for keyword in keywords_list}

# Go through the 'keywords' column and count occurrences of each keyword.
# Empty cells are read back by pandas as NaN (a float), which has no .count()
# method, so they are skipped.
for entry in df['keywords'].dropna():
    for keyword in keywords_list:
        # str.count counts non-overlapping substring matches, so a keyword is
        # also counted when it appears inside a longer tag.
        keyword_counts[keyword] += entry.count(keyword)

# Print the results
for keyword, count in keyword_counts.items():
    print(f"'{keyword}': {count} occurrences")

# Count the total number of entries (rows) in the DataFrame
print(len(df))

'Enemies to Lovers': 94 occurrences
'e2l': 0 occurrences
'enemies to lovers': 0 occurrences
'E2L': 0 occurrences
364


In [None]:
# Load the table saved by the Word2Vec cell (same file name; assumes the
# notebook's working directory is /content). Despite the variable name, the
# converted columns hold Word2Vec vectors serialised as text, not TF-IDF values.
tf_idf = pd.read_csv('/content/word2vec_representation_overwritten.csv')


import pandas as pd
import re


def convert_to_float_list(string):
    """Parse the text form of a numeric vector back into a list of floats.

    Handles both layouts a vector can take after a CSV round trip:

    * whitespace-separated, possibly wrapped over several lines, e.g.
      ``"[0.1 0.2\\n 0.3]"``;
    * comma-separated, e.g. ``"[0, 0, 0]"``.

    Args:
        string: the serialised vector.

    Returns:
        A list of floats in their original order; empty for ``"[]"``.

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    # Remove unwanted characters (brackets, newline) using regex
    clean_string = re.sub(r'[\[\]\n]', '', string)

    # Treat commas as separators too, so a list-style repr does not leave
    # tokens such as "0," that float() rejects.
    string_list = clean_string.replace(',', ' ').split()

    # Convert each component to a float
    return [float(num) for num in string_list]

def convert_columns_to_word2vec(df, columns):
    """Turn serialised vector columns of ``df`` into lists of floats.

    Every cell of each column named in ``columns`` is passed through
    ``convert_to_float_list``. ``df`` is modified in place and is also
    returned.
    """
    for name in columns:
        # The parser takes exactly one argument, so it can be applied directly.
        parsed = df[name].apply(convert_to_float_list)
        df[name] = parsed

    return df

# numpy is needed for the statistics below; import it here so this cell also
# runs on a fresh kernel (the only other numpy import is in a later cell).
import numpy as np

# Specify the columns to convert
columns_to_convert = ['keywords', 'rating', 'contentWarning', 'romanticCategory']

# Apply the conversion to the specified columns (parses the text vectors back
# into lists of floats; tf_idf is modified in place).
df_transformed = convert_columns_to_word2vec(tf_idf, columns_to_convert)

# Summarise every vector by its mean, max and min component. New columns are
# named "<column>_<stat>" and are created in the order
# column -> (mean, max, min).
summary_stats = (("mean", np.mean), ("max", np.max), ("min", np.min))
stat_columns = []
for column in columns_to_convert:
    for stat_name, stat_func in summary_stats:
        stat_column = f"{column}_{stat_name}"
        df_transformed[stat_column] = df_transformed[column].apply(stat_func)
        stat_columns.append(stat_column)

# Keep only the summary columns, selected by name rather than by a hard-coded
# positional offset, so the result does not depend on the width of the input.
df_transformed = df_transformed[stat_columns]

print(df_transformed.head())

   keywords_mean  keywords_max  keywords_min  rating_mean  rating_max  \
0       0.002877      0.117595     -0.114768    -0.000008    0.009923   
1       0.006483      0.271132     -0.268012    -0.000377    0.008188   
2       0.009143      0.381607     -0.374412     0.000394    0.009061   
3       0.004051      0.168580     -0.164746     0.000157    0.009619   
4       0.003227      0.132687     -0.131532     0.000394    0.009061   

0   -0.009122             0.000171            0.010017           -0.010721   
1   -0.009075             0.000456            0.008414           -0.007713   
2   -0.009255             0.000456            0.008414           -0.007713   
3   -0.009604             0.000456            0.008414           -0.007713   
4   -0.009255             0.000018            0.011680           -0.010382   

   romanticCategory_mean  romanticCategory_max  romanticCategory_min  
0               0.000333              0.008749             -0.009150  
1               0.000157    

In [None]:
# Preview `df`. When the cells are run top to bottom, this is the frame
# re-read from output_AO3.csv in the keyword-count cell, i.e. the text columns
# before any Word2Vec conversion.
print(df.head())

         id  kudos                                      title  \
0  27042589     11                                    Bridges   
1  13681566    177                       Entirely by Accident   
2  13791840     18                     Nobody's Second Choice   
3  14301855    294  An Unusual Request and One Hell of a Deal   
4  15128201    180                                  won't he?   

                                            keywords  amount_keywords  \
0  Period-Typical Language, A little (implicit) C...                7   
1  Implied Sexual Content, Sexual Tension, Fluff ...                5   
2   Friends to Lovers, Alternate Universe - Hogwarts                2   
3  More angst than i originally intended, Origina...               19   
4  very cute, Fluff, summer at the Burrow, lot's ...               14   

  romanticCategory  amount_romanticCategory             rating  \
0         M/M, F/M                        2             Mature   
1              M/M                    

In [None]:
# Numeric per-work columns used for modelling, selected by name instead of by
# position (these are columns 1, 4, 6, 9 and 12 of output_AO3.csv).
# 'kudos' must stay first: make_pred treats the first column as the target.
numeric_columns = ['kudos', 'amount_keywords', 'amount_romanticCategory', 'words', 'up_time']

# Append the per-vector summary statistics side by side (aligned on the index).
data = pd.concat([df[numeric_columns], df_transformed], axis=1)
print(data.head())

   kudos  amount_keywords  amount_romanticCategory  words  up_time  \
0     11                7                        2  15011       18   
1    177                5                        1  15251       47   
2     18                2                        1    443       47   
3    294               19                        1   7461       45   
4    180               14                        1   9164       43   

   keywords_mean  keywords_max  keywords_min  rating_mean  rating_max  \
0       0.002877      0.117595     -0.114768    -0.000008    0.009923   
1       0.006483      0.271132     -0.268012    -0.000377    0.008188   
2       0.009143      0.381607     -0.374412     0.000394    0.009061   
3       0.004051      0.168580     -0.164746     0.000157    0.009619   
4       0.003227      0.132687     -0.131532     0.000394    0.009061   

0   -0.009122             0.000171            0.010017           -0.010721   
1   -0.009075             0.000456            0.008414        

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

def make_pred(data, test_size=0.2, random_state=None):
    """Fit an RBF-kernel SVR on one random split and score it on the held-out part.

    The first column of ``data`` is the regression target and all remaining
    columns are the features. Features and target are standardised with
    scalers fitted on the training part only; predictions are mapped back to
    the original target scale before the errors are computed.

    Args:
        data: DataFrame whose column 0 is the target and columns 1.. are
            numeric features.
        test_size: proportion (or absolute number) of rows held out for
            testing; forwarded to ``train_test_split``.
        random_state: seed forwarded to ``train_test_split``. The default
            ``None`` gives a different split on every call.

    Returns:
        ``(mse, rmse)`` on the test rows, in the units of the original target.
    """
    x = data.iloc[:, 1:].values
    # StandardScaler expects 2-D input, hence the column-vector shape.
    y = data.iloc[:, 0].values.reshape(-1, 1)

    # Splitting the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # Standardize the features; fit on the training rows only so no
    # information from the test rows leaks into the scaler.
    sc_x = StandardScaler()
    X_train = sc_x.fit_transform(X_train)
    X_test = sc_x.transform(X_test)

    # Standardize the training target; SVR.fit wants a 1-D target.
    sc_y = StandardScaler()
    y_train_scaled = sc_y.fit_transform(y_train).ravel()

    # Creating and fitting the SVR model
    regressor = SVR(kernel='rbf', epsilon=0.001, gamma="auto")
    regressor.fit(X_train, y_train_scaled)

    # Predict in the scaled space, then map back to the original target scale.
    y_pred_scaled = regressor.predict(X_test)
    y_pred = sc_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()

    # The test target is compared on its original scale directly; scaling it
    # and immediately inverting the scaling would only add rounding noise.
    y_true = y_test.ravel()

    # Calculating metrics
    mse = mean_squared_error(y_true, y_pred)
    rmse = root_mean_squared_error(y_true, y_pred)

    return mse, rmse

# Number of random train/test splits to average over, and the seed that makes
# the sequence of splits repeatable.
N_RUNS = 200
SEED = 42

# make_pred is called without a random_state, so train_test_split draws its
# shuffles from NumPy's global RNG. Seeding that RNG once here makes the
# averaged metrics identical from one notebook run to the next.
np.random.seed(SEED)

mse_lst = []
rmse_lst = []

for i in range(N_RUNS):
    mse, rmse = make_pred(data)
    mse_lst.append(mse)
    rmse_lst.append(rmse)

# Report the metrics averaged over all runs
print("Mean Squared Error:", round(np.mean(mse_lst), 2))
print("Root Mean Squared Error:", round(np.mean(rmse_lst), 2))



Mean Squared Error: 624303.06
Root Mean Squared Error: 727.83
