# Cluster

Some helpful links:

- https://docs.voxel51.com/tutorials/clustering.html
- https://docs.voxel51.com/tutorials/dimension_reduction.html

In [None]:
# folder setup
import os

# absolute path of the `screenshots` folder inside the current working
# directory; the trailing slash is part of the value
path_samples = '{}/screenshots/'.format(os.path.abspath(os.getcwd()))

# essentials ...
import sys, math
def progress_bar(count_value, total):
    """Draw a 100-character text progress bar on stdout, in place.

    The bar is written followed by a carriage return (no newline), so the
    next call overwrites it on the same line.

    Args:
        count_value: number of items processed so far.
        total: number of items overall. A value of zero or less is treated
            as "nothing to do" and renders a complete bar instead of
            raising ZeroDivisionError.
    """
    bar_width = 100
    if total > 0:
        filled_up_length = int(math.floor(bar_width * count_value / total))
        # clamp so the bar is always exactly `bar_width` characters wide,
        # even for counts outside the 0..total range
        filled_up_length = max(0, min(bar_width, filled_up_length))
    else:
        filled_up_length = bar_width
    bar = '=' * filled_up_length + '-' * (bar_width - filled_up_length)
    sys.stdout.write('[%s]\r' % bar)
    sys.stdout.flush()

def done():
    """Print the completion message, surrounded by blank lines."""
    message = '\n\nDone 👍🏻\n'
    print(message)

## Install FiftyOne

In [None]:
!pip uninstall -y opencv-python
!pip install -U fiftyone torch torchvision umap-learn scikit-learn
!fiftyone plugins download https://github.com/jacobmarks/clustering-plugin

## Prepare Dataset

In [None]:
import os, csv, json
import fiftyone as fo

dataset = fo.Dataset('ludens')

# open the wikidata query file to link wikidata_id in filename and csv;
# newline='' is what the csv module expects for files handed to csv.reader
with open('query.csv', newline='') as games_data:
    games_rows = csv.reader(games_data, delimiter=',', quotechar='"')
    games = {}

    # save game title and country of origin, keyed by the last path segment
    # of the first column (the ID that prefixes the screenshot filenames)
    for game in games_rows:
        if len(game) < 5:
            # blank or truncated row: columns 1 and 4 are not available
            continue
        games[game[0].split('/')[-1]] = {
            'title': game[1],
            'countries': game[4].replace(',', '|')
        }

# load reformatted mobygames platforms file to get proper platform label
with open('platforms_reformatted.json') as platforms_data:
    platforms = json.load(platforms_data)

# files that cannot be matched to a game are collected here and reported at
# the end instead of aborting the whole import with a KeyError
skipped = []

# go through all screenshots in the samples folder
for path, folders, files in os.walk(path_samples):

    for sample_count, screenshot in enumerate(files, start=1):

        filepath = os.path.join(path, screenshot)

        # the filename is '_'-separated: part 0 is the wikidata ID, part 1
        # the '-'-separated years and part 3 the key into the platforms file
        parts = screenshot.split('_')
        wikidata_id = parts[0]
        game_info = games.get(wikidata_id)

        if game_info is None or len(parts) < 2:
            # stray file (e.g. OS metadata) or ID missing from query.csv
            skipped.append(filepath)
        else:
            try:
                platform = platforms[parts[3]]['platform_name']
            except (IndexError, KeyError, TypeError):
                # no platform part in the filename, or no label known for it
                platform = ''

            sample = fo.Sample(filepath=filepath)

            sample['wikidata_id'] = wikidata_id
            sample['title'] = game_info['title']
            sample['platform'] = platform
            sample['years'] = parts[1].split('-')
            sample['countries'] = game_info['countries']

            dataset.add_sample(sample)

        # progress is per folder and counts skipped files as processed
        progress_bar(sample_count, len(files))

done()
if skipped:
    print('Skipped %d file(s) without a matching game:' % len(skipped))
    for skipped_path in skipped:
        print('  ' + skipped_path)
print(dataset)

In [None]:
# start the FiftyOne App for the dataset and keep a handle on the session
# NOTE(review): auto=False presumably keeps the App from being shown inline
# in the notebook automatically — confirm against the FiftyOne docs
session = fo.launch_app(
    dataset,
    auto=False,
)

Open [FiftyOne 🔗](http://localhost:5151).

### Calculate Embeddings

In [None]:
import fiftyone.brain as fob
import fiftyone.zoo as foz

# fetch the "resnet50-imagenet-torch" model from the FiftyOne model zoo
resnet50 = foz.load_zoo_model("resnet50-imagenet-torch")

# run the model over the dataset and store the result on each sample in the
# `resnet50_embeddings` field, which the visualisation cells read from
dataset.compute_embeddings(resnet50, embeddings_field="resnet50_embeddings")

### Compute Visualisations

#### UMAP
- default parameters
- lower the minimum distance and use fewer neighbours to break up global structures
- raise the minimum distance and use more neighbours to pack the visualisation more tightly
- [fiftyone.brain.visualization.UMAPVisualizationConfig](https://docs.voxel51.com/api/fiftyone.brain.visualization.html#fiftyone.brain.visualization.UMAPVisualizationConfig)

In [None]:
# UMAP on the stored ResNet-50 embeddings with a large neighbourhood
# (num_neighbors=100). UMAP is stochastic, so a fixed seed is passed to make
# the layout reproducible when the notebook is re-run.
res = fob.compute_visualization(
    dataset,
    embeddings="resnet50_embeddings",
    method="umap",
    brain_key="resnet50_umap_nn100_vis",
    num_neighbors=100,
    seed=51
)

# additionally store the embedded points on the samples themselves
dataset.set_values("resnet50_umap_nn100", res.current_points)

In [None]:
# UMAP on the stored ResNet-50 embeddings with a small minimum distance
# (min_dist=0.001) and a small neighbourhood (num_neighbors=5). UMAP is
# stochastic, so a fixed seed is passed to make the layout reproducible when
# the notebook is re-run.
res = fob.compute_visualization(
    dataset,
    embeddings="resnet50_embeddings",
    method="umap",
    brain_key="resnet50_umap_md0001_nn5_vis",
    min_dist=0.001,
    num_neighbors=5,
    seed=51
)

# additionally store the embedded points on the samples themselves
dataset.set_values("resnet50_umap_md0001_nn5", res.current_points)