## I. Get the data

##### 1. Scrape the title of game from Wikipedia

In [1]:
import requests
import bs4 as bs

elements = []
url = requests.get('https://en.wikipedia.org/wiki/List_of_Nintendo_Switch_games_(0%E2%80%939_and_A)').text
soup = bs.BeautifulSoup(url, 'lxml')
table = soup.find('table', class_='wikitable plainrowheaders sortable')

for i in table.find_all('tr'):
    try:
        game = i.find_all('th')
        name = game[0].find(text=True)
        link = game[0].find(href=True)['href']
    except: pass
    else:        
        elements.append([name, link])



##### 2. Create a dataframe 

In [2]:
import pandas as pd
df_games = pd.DataFrame(elements, columns=['title', 'link'])
df_games = df_games.astype(str)
df_games = df_games.applymap(lambda x: x.replace('\n', ''))
df_games = df_games.applymap(lambda x: x.replace(':', ''))
print('Shape of dataframe: ', df_games.shape, '\n')
df_games.head() 

Shape of dataframe:  (115, 2) 



Unnamed: 0,title,link
0,1-2-Switch,/wiki/1-2-Switch
1,10 Second Ninja X,/wiki/10_Second_Ninja_X
2,13 Sentinels Aegis Rim,/wiki/13_Sentinels_Aegis_Rim
3,140,/wiki/140_(video_game)
4,1979 Revolution Black Friday,/wiki/1979_Revolution_Black_Friday


##### 3. Scrape information about the game

In [5]:
wiki_origin = 'https://en.wikipedia.org'
content = []
for index, row in df_games.iterrows():
    url = wiki_origin + row['link']
    detail_url = requests.get(url).text
    soup = bs.BeautifulSoup(detail_url,'lxml')
    text = ''
    for section in soup.find_all('h2'):
        if section.text.startswith('Game') or section.text.startswith('Plot'):
            text += section.text + '\n\n'
            for element in section.next_siblings:
                if element.name and element.name.startswith('h'):
                    break
                elif element.name == 'p':
                    text += element.text + '\n'
        else: pass
    if not text:
        content.append(None)
    else:
        content.append(text)



##### 4. Clean the text of information

In [6]:
content_clean = []
for text in content:
    if text is not None:
        text = text.replace(r'\[.*?\]+', '')
        text = text.replace('\n', ' ')
        text = text.replace('[edit]', '')
        text = text.replace('Gameplay ', '')
        text = text.replace('Game-play ', '')
        text = text.replace('Plot ', '')
        content_clean.append(text)
    else:
        content_clean.append(None)

df_games['content'] = content_clean

todrop = df_games[df_games['title']=='Untitled '].index.tolist()
df_games.drop(index=todrop, inplace=True)

df_games.head()

Unnamed: 0,title,link,content
0,1-2-Switch,/wiki/1-2-Switch,1-2-Switch is a party game in which players d...
1,10 Second Ninja X,/wiki/10_Second_Ninja_X,10 Second Ninja X is a sidescrolling puzzle p...
2,13 Sentinels Aegis Rim,/wiki/13_Sentinels_Aegis_Rim,13 Sentinels: Aegis Rim is a video game where...
3,140,/wiki/140_(video_game),"As described by Carlsen, 140 is ""an old schoo..."
4,1979 Revolution Black Friday,/wiki/1979_Revolution_Black_Friday,


##### 5. Save the dataset

In [7]:
df_games = df_games.dropna()
df_games.to_csv('dataset.csv')   

## II. Build the model

#### 1. Tokenize the text

In [8]:
import nltk
nltk.download('punkt')
import re
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
def tokenize_and_stem(text):
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    stems = [stemmer.stem(word) for word in filtered_tokens]
    return stems

#### 2. Feature extraction from text

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem,
                                 ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform([x for x in df_games["content"]])



##### 3. Apply Kmeans to cluster similar games

In [11]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=7)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
df_games["cluster"] = clusters

##### 4. Calculate the similarity distance of each game

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

similarity_distance = 1 - cosine_similarity(tfidf_matrix)
mergings = linkage(similarity_distance, method='complete')

In [13]:
vals = df_games['title'].tolist()
similarity_df = pd.DataFrame(similarity_distance, columns=vals, index=vals)

##### 5. Recommend the similar items to one of the game

In [14]:
title = '1-2-Switch'
matches = similarity_df[title].sort_values()[1:6]
matches = matches.index.tolist()
df_games.set_index('title').loc[matches]

Unnamed: 0_level_0,link,content,cluster
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ancestors Legacy,/wiki/Ancestors_Legacy,Ancestor's Legacy offers two game modes: a si...,1
Another World,/wiki/Another_World_(video_game),"Another World is a platform game, featuring a...",5
Aaero,/wiki/Aaero,The basic controls are that of a twin-stick s...,5
Astria Ascending,/wiki/Astria_Ascending,Astria Ascending primarily features two modes...,5
Ape Out,/wiki/Ape_Out,Ape Out is a single player beat 'em up video ...,5
