In [5]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from itertools import combinations

import pandas as pd
import numpy as np
import random
from scipy import stats

data = pd.read_csv('https://raw.githubusercontent.com/zhenyachess/artificial_intelligence_methods/main/5/ign.csv')

Посмотрим первые строки выборки:

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11


Определим метрики:

In [8]:
def f_ratio(s1, s2):
  return fuzz.ratio(s1, s2)/100

def f_partial_ratio(s1, s2):
  return fuzz.partial_ratio(s1, s2)/100

def f_token_sort_ratio(s1, s2):
  return fuzz.token_sort_ratio(s1, s2)/100

def f_token_set_ratio(s1, s2):
  return fuzz.token_set_ratio(s1, s2)/100

def f_wratio(s1, s2):
  return fuzz.WRatio(s1, s2)/100

def compare_float(x1, x2):
  return 1.0 if abs(x1 - x2) < 0.5 else 0.0

def compare_num(x1, x2):
  return 1.0 if x1 == x2 else 0.0

def calc_fuz(s1, s2, method):
  if method == 1:
    a = fuzz.ratio(s1, s2)/100
  elif method == 2:
    a = fuzz.partial_ratio(s1, s2)
  elif method == 3:
    a = fuzz.token_sort_ratio(s1, s2)
  elif method == 4:
    a = fuzz.token_set_ratio(s1, s2)
  else:
    a = fuzz.WRatio(s1, s2)

  return a/100

print(f_partial_ratio("Amazing", "Great"))

0.2


Возьмём для тестирования срез исходной выборки до 50 строк.

In [9]:
DATA_SIZE = 50

test_data = data[:DATA_SIZE].copy()
test_data.head(DATA_SIZE)

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11
5,5,Good,Total War Battles: Shogun,/games/total-war-battles-shogun/mac-142565,Macintosh,7.0,Strategy,N,2012,9,11
6,6,Awful,Double Dragon: Neon,/games/double-dragon-neon/xbox-360-131320,Xbox 360,3.0,Fighting,N,2012,9,11
7,7,Amazing,Guild Wars 2,/games/guild-wars-2/pc-896298,PC,9.0,RPG,Y,2012,9,11
8,8,Awful,Double Dragon: Neon,/games/double-dragon-neon/ps3-131321,PlayStation 3,3.0,Fighting,N,2012,9,11
9,9,Good,Total War Battles: Shogun,/games/total-war-battles-shogun/pc-142564,PC,7.0,Strategy,N,2012,9,11


Создадим словарь для метрики для каждого столбца и заданного веса, а также словарь, в котором в качестве ключей инициализированы номера колонок, а в качестве значений название колонок.

In [69]:
dict_comparison = {
    0: (f_ratio, 0.0),
    1: (f_token_sort_ratio, 1.0),
    2: (f_token_set_ratio, 0.0),
    3: (f_ratio, 0.0),
    4: (compare_float, 0.8),
    5: (f_token_sort_ratio, 1.0),
    6: (f_ratio, 0.0),
    7: (compare_num, 0.0),
    8: (compare_num, 0.0),
    9: (compare_num, 0.0)
}

names = list(test_data.columns[1:])
indexes = [i for i in range(len(names))]
dict_columns = dict(zip(indexes, names))
print(dict_columns)

{0: 'score_phrase', 1: 'title', 2: 'url', 3: 'platform', 4: 'score', 5: 'genre', 6: 'editors_choice', 7: 'release_year', 8: 'release_month', 9: 'release_day', 10: 'GROUP'}


Создадим функцию, которая будет вычислять номер кластера для каждой строки в датасете, учитывая веса и метрики для определенных столбцов.

In [70]:
def make_clusters(td, d_comparison, d_columns): 
  dict_group = dict()
  BOUND = 0.7
  td['GROUP'] = ''
  CNT_CL = 0

  columns = [(td[d_columns[index]], d_comparison[index][0], d_comparison[index][-1]) for index in range(len(d_columns)-1) if d_comparison[index][-1] != 0]
  w = [column[-1] for column in columns]
  functions = [column[1] for column in columns]

  # Проходим по всем строкам выборки для столбца title
  for i in range(len(test_data)):
    # Записываем i-тое название в каждый элемент нового списка

    values_i = [column[0][i] for column in columns]


    # При первой итерации создаем кластер 0
    # Добавляем в него values
    # В таблице отмечаем номер кластера
    if i == 0:
      dict_group[i] = [tuple(values_i)]
      test_data['GROUP'][i] = i
    else:
      # Проходимся по всем строкам до текущей без включения
      for j in range(i):
        # Записываем j-тое название в отдельную переменную
        values_j = [column[0][j] for column in columns]

        s = 0
        for index in range(len(w)):
          s += (functions[index](values_i[index], values_j[index])) * w[index]

        if s / len(w) > BOUND:
          # Записываем номер класса в переменную
          index = test_data['GROUP'][j]
          # Добавляем кортеж в кластер
          dict_group[index].append(tuple(values_i))
          # В таблице отмечаем номер кластера
          test_data['GROUP'][i] = index
          # Выходим из цикла, так как кластер найден
          break
      # Если не было похожих строк, то необходимо создать новый кластер
      else:
        # Увеличиваем номер кластера
        CNT_CL += 1
        # Создаем кластер с кортежом
        dict_group[CNT_CL] = [tuple(values_i)]
        # В таблице отмечаем номер кластера
        test_data['GROUP'][i] = CNT_CL

Проведем кластеризацию выборки по заданным метрикам и весам для выбранных столбцов.

In [71]:
make_clusters(test_data, dict_comparison, dict_columns)
pd.set_option('display.max_rows', DATA_SIZE)
test_data.sort_values(by='GROUP').head(DATA_SIZE)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day,GROUP
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12,0
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12,0
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12,1
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11,2
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11,2
5,5,Good,Total War Battles: Shogun,/games/total-war-battles-shogun/mac-142565,Macintosh,7.0,Strategy,N,2012,9,11,3
9,9,Good,Total War Battles: Shogun,/games/total-war-battles-shogun/pc-142564,PC,7.0,Strategy,N,2012,9,11,3
6,6,Awful,Double Dragon: Neon,/games/double-dragon-neon/xbox-360-131320,Xbox 360,3.0,Fighting,N,2012,9,11,4
8,8,Awful,Double Dragon: Neon,/games/double-dragon-neon/ps3-131321,PlayStation 3,3.0,Fighting,N,2012,9,11,4
7,7,Amazing,Guild Wars 2,/games/guild-wars-2/pc-896298,PC,9.0,RPG,Y,2012,9,11,5
