guesslang requires TensorFlow 2.5, which does not run on Apple silicon, so this notebook uses Colab to download and label the GitHub dataset.

In [1]:
!pip install guesslang datasets zstandard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting guesslang
  Downloading guesslang-2.2.1-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting zstandard
  Downloading zstandard-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow==2.5.0
  Downloading tensorflow-2.5.0-cp38-cp38-manylinux2010_x86_64.whl (454.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.4/454.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting keras-nightly~=2

In [2]:
import datasets
import guesslang
import os
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Upper bound on the number of GitHub documents pulled from the stream.
N_SEQS = 100_000

# Minimum top-1 guesslang probability for a document to be kept in the
# language-identification dataset / the Python-only dataset, respectively.
LANG_THRESHOLD = 0.8
PYTHON_THRESHOLD = 0.9

# Languages kept as labels; any other top-1 prediction is recorded as 'other'.
LANGS = {
    'C', 'C++', 'Go',
    'HTML', 'Java', 'JavaScript',
    'PHP', 'Python', 'XML',
}

In [4]:
def _is_github(example):
    """Return True when the example's metadata marks it as coming from the 'Github' subset."""
    return example['meta']['pile_set_name'] == 'Github'


# Stream the test split, keep only GitHub documents, and materialise at most
# N_SEQS of them as an in-memory Dataset.
pile = datasets.load_dataset('the_pile', split="test", streaming=True)
github_pile = pile.filter(_is_github)
github_subset = datasets.Dataset.from_list(list(github_pile.take(N_SEQS)))

Downloading builder script:   0%|          | 0.00/9.26k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]



In [5]:
# Record, for every document, the first (language, probability) pair returned
# by guesslang; languages outside LANGS are collapsed into 'other'.
guesser = guesslang.Guess()
prob_list, lang_list = [], []
for i, row in enumerate(github_subset):
    top_lang, top_prob = guesser.probabilities(row['text'])[0]
    prob_list.append(top_prob)
    if top_lang in LANGS:
        lang_list.append(top_lang)
    else:
        lang_list.append('other')
    # Progress marker every 1000 documents.
    if not i % 1000:
        print(i)


0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000


In [6]:
# Attach the predicted language and its probability to every document.
# `add_column` rejects a column name that already exists, so drop any label
# columns left over from a previous run of this cell first; this makes the
# cell safe to re-execute.
stale_columns = [name for name in ('lang_prob', 'lang') if name in github_subset.column_names]
if stale_columns:
    github_subset = github_subset.remove_columns(stale_columns)
github_subset = github_subset.add_column('lang_prob', prob_list)
github_subset = github_subset.add_column('lang', lang_list)

# Language-ID dataset: any language in LANGS predicted with enough confidence.
lang_id_dataset = github_subset.filter(lambda x: x['lang'] != 'other' and x['lang_prob'] >= LANG_THRESHOLD)
# Python-only dataset: uses the stricter PYTHON_THRESHOLD.
python_dataset = github_subset.filter(lambda x: x['lang'] == 'Python' and x['lang_prob'] >= PYTHON_THRESHOLD)

  0%|          | 0/19 [00:00<?, ?ba/s]

  0%|          | 0/19 [00:00<?, ?ba/s]

In [7]:
# Number of rows kept in each filtered dataset.
lang_id_dataset.num_rows, python_dataset.num_rows

(7232, 607)

In [8]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
mount_point = '/content/gdrive/'
drive.mount(mount_point)

Mounted at /content/gdrive/


In [9]:
data_path = '/content/gdrive/MyDrive/datasets/github/'

# Write each filtered dataset to its own directory under data_path.
for dataset, dirname in (
    (lang_id_dataset, 'github_lang_id.hf'),
    (python_dataset, 'python_source.hf'),
):
    dataset.save_to_disk(os.path.join(data_path, dirname))

Flattening the indices:   0%|          | 0/8 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/7232 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Saving the dataset (0/1 shards):   0%|          | 0/607 [00:00<?, ? examples/s]

In [10]:
import numpy as np

In [11]:
# Per-language sample counts in the language-ID dataset.
lang_labels = np.array(lang_id_dataset['lang'])
np.unique(lang_labels, return_counts=True)

(array(['C', 'C++', 'Go', 'HTML', 'Java', 'JavaScript', 'PHP', 'Python',
        'XML'], dtype='<U10'),
 array([ 949, 1019,  817,  519, 1183,  716,  631,  637,  761]))