# Basic Model Training

## Importing

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('clean_data.csv', sep=',')

### Series
The series data won't be much use in the wpca if we condense it, so instead we are going to make a dictionary so we can boost the simularity score later

In [2]:
# Fill the NaN values with an empty list
df['series'] = df['series'].fillna("")

# Split the series column by '-'
df['series'] = df['series'].str.split('-')

# Create an empty dictionary
series_dict = {}

# Loop through the series column
for series in df['series']:
    # Loop through each game id in the series
    for game_id in series:
        # Get the other games in the same series
        other_games = [x for x in series if x != game_id]
        # Add the game id and the other games to the dictionary
        series_dict[game_id] = other_games

# Print the series dictionary
#print(series_dict)

In [3]:
# Drop Columns
X = df.drop(['name'], axis=1)
ids = X['id']
X = X.drop(['id'], axis=1)
X = X.drop(['series'], axis=1).values
y = df['name'].values

## Reducing Dimensionality

The current data has way too many columns, and a bunch with 1.0 correlation. We want to simplify the data to make it easier to train and find more relelvant categories. 

##### Run one, not both. 

## Trimap

In [68]:
from trimap import TRIMAP

num_comp = 4 # You can choose a different number of components for TriMap

# Create and fit TriMap with the original features
trimap = TRIMAP(n_dims=num_comp, n_inliers=6, n_outliers=12, n_random=4, distance='manhattan', weight_temp=1000.0, lr=100.0, n_iters=1500, verbose=True)
trimap.fit(X)

columns = ['trimap' + str(i) for i in range(0, num_comp)]

# Access the reduced features from the embedding_ attribute
X_trimap = trimap.embedding_

# Create a dataframe with the reduced features
X_df = pd.DataFrame(X_trimap, columns=columns)

TRIMAP(n_inliers=6, n_outliers=12, n_random=4, distance=manhattan, weight_temp=1000.0, lr=100.0, n_iters=1500, apply_pca=True, opt_method=dbd, verbose=True, return_seq=False)
running TriMap on 9804 points with dimension 1992
pre-processing
applied PCA
found nearest neighbors
sampled triplets
running TriMap with dbd
Iteration:  100 / 1500, Loss: 0.000, Violated triplets: 2.6318
Iteration:  200 / 1500, Loss: 0.000, Violated triplets: 2.7301
Iteration:  300 / 1500, Loss: 0.000, Violated triplets: 2.8167
Iteration:  400 / 1500, Loss: 0.000, Violated triplets: 2.8827
Iteration:  500 / 1500, Loss: 0.000, Violated triplets: 2.9199
Iteration:  600 / 1500, Loss: 0.000, Violated triplets: 2.9424
Iteration:  700 / 1500, Loss: 0.000, Violated triplets: 2.9619
Iteration:  800 / 1500, Loss: 0.000, Violated triplets: 2.9749
Iteration:  900 / 1500, Loss: 0.000, Violated triplets: 2.9851
Iteration: 1000 / 1500, Loss: 0.000, Violated triplets: 2.9931
Iteration: 1100 / 1500, Loss: 0.000, Violated triplet

## WPCA

In [4]:
from wpca import WPCA

num_comp = 128

# Define feature weights
# released, rating, playtime, 19 genres, 1994-24 tags, series dict 
w = np.array([[.7, .6, 4] + [1] * 19 + [.5] * (1994 - 24)] * 9804)

# Create and fit WPCA with feature weights
wpca = WPCA(n_components=num_comp)
wpca.fit(X, weights=w)

columns = ['pca' + str(i) for i in range(0, num_comp)]

# Transform features to lower dimensionality
X_wpca = wpca.transform(X)

# Create a dataframe with the transformed features and the target
X_df = pd.DataFrame(X_wpca, columns=columns)


In [5]:
print(wpca.explained_variance_ratio_)

[0.08034823 0.05336608 0.03936573 0.03032157 0.02865338 0.02679904
 0.02131693 0.01947955 0.01910622 0.01708147 0.01378495 0.013499
 0.01282992 0.01205059 0.01175677 0.0115811  0.01136731 0.01044218
 0.00978947 0.0094734  0.00935663 0.00908754 0.00849713 0.00797058
 0.00759595 0.0073008  0.00708377 0.00692518 0.00661844 0.00629393
 0.00624454 0.00592846 0.00574219 0.00568674 0.00546639 0.00539125
 0.00533596 0.00516247 0.00498238 0.00485034 0.00482227 0.00472267
 0.00465786 0.00459959 0.00448281 0.00436273 0.00434273 0.00428767
 0.00420638 0.00406887 0.00403946 0.00399563 0.0039396  0.00390266
 0.00384317 0.00373528 0.00362599 0.00359996 0.00349504 0.00341716
 0.00338099 0.00327909 0.00325617 0.00322249 0.0031739  0.00310131
 0.00306876 0.00302646 0.00297292 0.00291266 0.00287388 0.00281477
 0.0027843  0.00275526 0.00270718 0.00266549 0.00260009 0.00256999
 0.00253163 0.00250286 0.00249119 0.0024381  0.00241788 0.00240349
 0.00238789 0.00232189 0.00231943 0.00229408 0.00224499 0.002228

## Similarity Metrics

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform

cosine_sim = cosine_similarity(X_df, X_df)
euclidean_dist = squareform(pdist(X_df))

In [7]:
#euclidean_dist

In [8]:
#cosine_sim

In [9]:
X_df['name'] = y

# Set the name as the index
X_df = X_df.set_index('name')

In [10]:
def get_euclidean_recommendations(title, k):
    idx = X_df.index.get_loc(title)
    sim_scores = euclidean_dist[idx]
    sim_indices = sim_scores.argsort()[:k]
    sim_titles = X_df.iloc[sim_indices].index
    return list(sim_titles)

def get_cosign_recommendations(title, k):
    idx = X_df.index.get_loc(title)
    sim_scores = cosine_sim[idx]
    sim_indices = sim_scores.argsort()[-k:][::-1]
    sim_titles = X_df.iloc[sim_indices].index
    return list(sim_titles)

In [11]:
recommendation = "Lego City Undercover"

In [12]:
get_euclidean_recommendations(recommendation, 20)

['Lego City Undercover',
 'LEGO Marvel Super Heroes 2',
 'LEGO The Incredibles',
 'LEGO Worlds',
 'Overlord: Fellowship of Evil',
 'LEGO DC Super-Villains',
 'Just Die Already',
 'Think of the Children',
 "Super Lucky's Tale",
 'Disc Jam',
 'LEGO Marvel Super Heroes',
 'Scribblenauts Unmasked: A DC Comics Adventure',
 'LEGO The Hobbit',
 'Disney Universe',
 'Shiftlings',
 'WWE 2K18',
 'Totally Reliable Delivery Service',
 'Red Wings: Aces of the Sky',
 'Omno',
 "Black Future '88"]

In [13]:
get_cosign_recommendations(recommendation, 15)

['Lego City Undercover',
 'LEGO Marvel Super Heroes 2',
 'LEGO Worlds',
 'LEGO Marvel Super Heroes',
 'LEGO Jurassic World',
 'LEGO Batman 3: Beyond Gotham',
 'LEGO The Hobbit',
 'LEGO The Incredibles',
 'The LEGO Movie - Videogame',
 "LEGO Marvel's Avengers",
 'Scribblenauts Unmasked: A DC Comics Adventure',
 'Just Die Already',
 'LEGO Batman 2 DC Super Heroes',
 'Ben 10: Power Trip',
 'Saints Row: The Third Remastered']

#### This will take a long time, I added a loading bar for u 
This is taking the dictionary of games that in the same series and boosting their scores so that they will be more likely to be recommended

In [22]:
# Define a constant value to boost the similarity score
boost = 0.2
boost2 = boost*5

# Create a boolean matrix that indicates if two games are in the same series
series_mask = np.array([[game_id_j in series_dict.get(game_id_i, []) for game_id_j in df['id']] for game_id_i in df['id']])

# Add the boost value to the similarity score where the mask is True
cosine_sim[series_mask] += boost
euclidean_dist[series_mask] += boost2

# Print the modified cosine similarity matrix
print(cosine_sim)


[[ 1.          0.05919384 -0.14147912 ... -0.23118528 -0.22138062
  -0.08359288]
 [ 0.05919384  1.          0.38768704 ... -0.00366065  0.0079375
  -0.18907453]
 [-0.14147912  0.38768704  1.         ...  0.00321481 -0.07930926
  -0.1154011 ]
 ...
 [-0.23118528 -0.00366065  0.00321481 ...  1.          0.91216788
   0.08888081]
 [-0.22138062  0.0079375  -0.07930926 ...  0.91216788  1.
   0.12012742]
 [-0.08359288 -0.18907453 -0.1154011  ...  0.08888081  0.12012742
   1.        ]]


In [25]:
get_cosign_recommendations(recommendation, 15)

['Lego City Undercover',
 'LEGO Marvel Super Heroes 2',
 'LEGO Worlds',
 'LEGO Marvel Super Heroes',
 'LEGO Jurassic World',
 'LEGO Batman 3: Beyond Gotham',
 'LEGO The Hobbit',
 'LEGO The Incredibles',
 'The LEGO Movie - Videogame',
 "LEGO Marvel's Avengers",
 'Scribblenauts Unmasked: A DC Comics Adventure',
 'Just Die Already',
 'LEGO Batman 2 DC Super Heroes',
 'Ben 10: Power Trip',
 'Saints Row: The Third Remastered']

In [24]:
get_euclidean_recommendations(recommendation, 20)

['Lego City Undercover',
 'LEGO Marvel Super Heroes 2',
 'LEGO The Incredibles',
 'LEGO Worlds',
 'Overlord: Fellowship of Evil',
 'LEGO DC Super-Villains',
 'Just Die Already',
 'Think of the Children',
 "Super Lucky's Tale",
 'Disc Jam',
 'LEGO Marvel Super Heroes',
 'Scribblenauts Unmasked: A DC Comics Adventure',
 'LEGO The Hobbit',
 'Disney Universe',
 'Shiftlings',
 'WWE 2K18',
 'Totally Reliable Delivery Service',
 'Red Wings: Aces of the Sky',
 'Omno',
 "Black Future '88"]

## Save
Saving both the cosine and euclidean for each model
### WPCA

In [18]:
# save the matrix to a file
np.save('cosine_sim_wpca.npy', cosine_sim)
np.save('euclidean_dist_wpca.npy', euclidean_dist)
X_df.to_csv('Names')

In [17]:
X_df

Unnamed: 0_level_0,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,...,pca118,pca119,pca120,pca121,pca122,pca123,pca124,pca125,pca126,pca127
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
King Arthur's Gold,1.773234,1.032150,1.902629,-0.440357,0.805671,0.141405,0.232584,-0.690620,0.380632,-0.471294,...,0.033086,0.359804,0.136007,-0.300612,0.409327,-0.019106,-0.224429,-0.194204,-0.213097,-0.103955
Grand Theft Auto V,0.357836,2.505352,-1.145935,0.539819,-0.465173,0.444491,-0.358961,0.097099,0.441402,0.568385,...,0.010285,-0.062953,0.060661,-0.015445,-0.143621,-0.259785,0.093891,0.077333,-0.162321,-0.105063
The Witcher 3: Wild Hunt,0.034730,0.842484,-2.244771,1.733561,-0.223351,-0.552821,0.004105,0.190435,0.133942,-0.030671,...,0.069160,0.203803,-0.167136,-0.086792,0.066610,0.036848,-0.242603,-0.380162,-0.061129,-0.282251
Portal 2,1.995394,1.933911,0.416429,-0.246678,-0.516543,1.096432,-1.677157,-0.473507,0.658938,0.308909,...,-0.033717,-0.106452,-0.080646,-0.168641,-0.006004,-0.164825,0.100167,-0.151331,0.125814,-0.007144
Tomb Raider (2013),-0.248617,0.985814,-1.616339,0.437725,-0.327508,0.070863,-0.353410,-0.600473,0.407227,0.035671,...,-0.107254,-0.056279,-0.048457,0.091858,0.041770,0.043406,0.082340,-0.001969,0.032949,0.274814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Castle of no Escape,1.136410,-0.462946,-0.491961,0.891507,-0.916404,-0.133119,0.994670,-0.640544,0.068114,-0.377724,...,-0.022223,-0.183706,0.108173,-0.038613,0.017844,-0.093073,-0.149352,0.445049,0.189627,-0.008595
Dying Light: The Following,-1.212584,0.203153,-0.573591,-0.482775,-0.239903,-0.504783,-0.096952,0.224123,0.021455,-0.748305,...,0.000877,0.005701,0.019756,-0.005927,-0.027077,0.012662,0.020231,0.006745,0.017759,0.009347
Fallout 3: Operation Anchorage,-1.463108,0.094480,-0.039547,0.008626,-0.683196,-0.451978,0.010053,-0.842529,-0.060416,-0.210790,...,0.049912,0.046875,-0.014027,0.008299,0.005144,-0.011430,-0.028412,-0.014030,0.039137,-0.011168
Soldier of Fortune: Payback,-1.511783,0.075174,0.000994,-0.455371,-0.749236,-0.179334,-0.102654,-0.723883,-0.027928,-0.096324,...,0.051850,-0.019948,-0.019920,0.006016,0.039221,-0.036867,-0.007270,0.028985,0.032589,-0.013948


### Trimap

In [None]:
np.save('cosine_sim_tri.npy', cosine_sim)
np.save('euclidean_dist_tri.npy', euclidean_dist)