In [11]:
import re

from tqdm import tqdm
import pandas as pd

In [16]:
# Paths to the py150 index files. The training index is read line by line by
# read_train_data; eval_file_name is not used in the cells shown here
# (presumably the matching evaluation index -- confirm before relying on it).
train_file_name = "../data/py150/py150_files/python100k_train.txt"
eval_file_name = "../data/py150/py150_files/python50k_eval.txt"
def read_train_data(file_name, batch_size=None):
    """Load the entries of an index file, one stripped line per entry.

    Args:
        file_name: Path of the text file to read.
        batch_size: Optional cap on the number of entries. A falsy value
            (``None`` or ``0``) means the whole file is read.

    Returns:
        A list with each line of the file, stripped of surrounding whitespace
        (blank lines are kept as empty strings).
    """
    entries = []
    with open(file_name, "r") as handle:
        for raw_line in handle:
            entries.append(raw_line.strip())

            # Stop early once the requested number of entries is collected.
            if batch_size and len(entries) >= batch_size:
                break

    return entries

def read_file_to_string(filename):
    """Read a file as bytes and return its contents decoded to text.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded contents; byte sequences that cannot be decoded are
        replaced rather than raising. If reading fails, the path is printed
        and an empty string is returned.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Start from empty *bytes* (not str) so the decode below still works when
    # read() fails and the fallback value is used.
    raw = b""
    # The context manager closes the handle on every exit path.
    with open(filename, "rb") as f:
        try:
            raw = f.read()
        except OSError:
            # Best effort: report the unreadable file and carry on.
            print(filename)
    return raw.decode(errors="replace")

# Load every entry of the training index (no batch_size cap).
train_data = read_train_data(train_file_name)

In [27]:
def count_comments(input_code):
    """Count ``#``-style comments in a piece of source text.

    Every span from a ``#`` to the end of its line counts as one comment.
    This is purely textual, so a ``#`` inside a string literal is counted too.

    Args:
        input_code: Source text to scan.

    Returns:
        A ``(count, total_length)`` tuple: the number of matched spans and the
        summed character length of those spans (including the ``#``).
    """
    matches = re.findall(r"#.*", input_code)
    total_length = sum(len(span) for span in matches)
    return len(matches), total_length

def create_df(data, base_dir="../data/py150/py150_files"):
    """Build a DataFrame of per-file comment statistics.

    Args:
        data: Iterable of file names of the form ``data/<user>/<repo>/...``,
            relative to ``base_dir``.
        base_dir: Directory that the file names are resolved against.
            Defaults to the location that was previously hard-coded.

    Returns:
        A DataFrame with one row per file and the columns ``file_name``,
        ``user_name``, ``repo_name``, ``py_script`` (file contents),
        ``comment`` (comment count), ``comment_len`` (summed comment length),
        ``comment_den`` (comments per line, rounded to 6 decimals) and
        ``line_count``. ``user_name`` and ``repo_name`` are ``None`` for file
        names that do not match the expected layout.
    """
    data_df = pd.DataFrame(data, columns=["file_name"])

    script_file_name_regex = re.compile(r"data/([^/]+)/([^/]+)/.+")

    usernames = []
    repos = []
    py_scripts = []
    comments = []
    comment_lens = []
    comment_dens = []
    line_counts = []

    for file_name in tqdm(data_df["file_name"]):

        match = script_file_name_regex.search(file_name)
        if match:
            username = match.group(1)
            repo_name = match.group(2)
        else:
            # Report the odd path and keep going instead of failing on
            # match.group() with match being None.
            print(file_name)
            username = None
            repo_name = None

        file_string = read_file_to_string(f"{base_dir}/{file_name}")
        # str.split always returns at least one element, so line_count >= 1
        # and the division below cannot hit zero. A trailing newline adds one
        # to the count.
        line_count = len(file_string.split('\n'))

        comment_count, comment_len = count_comments(file_string)
        comment_den = round(comment_count / line_count, 6)

        usernames.append(username)
        repos.append(repo_name)
        py_scripts.append(file_string)
        comments.append(comment_count)
        comment_lens.append(comment_len)
        comment_dens.append(comment_den)
        line_counts.append(line_count)

    data_df["user_name"] = usernames
    data_df["repo_name"] = repos
    data_df["py_script"] = py_scripts
    data_df["comment"] = comments
    data_df["comment_len"] = comment_lens
    data_df["comment_den"] = comment_dens
    data_df["line_count"] = line_counts

    return data_df

In [28]:
# Sanity check: number of entries loaded from the training index.
print(len(train_data))

100000


In [29]:
# Read every listed file and compute its comment statistics.
train_data_df = create_df(train_data)

100%|██████████| 100000/100000 [00:03<00:00, 27143.85it/s]


In [30]:
# Show the resulting DataFrame as the cell output.
train_data_df

Unnamed: 0,file_name,user_name,repo_name,py_script,comment,comment_len,comment_den,line_count
0,data/00/wikihouse/urls.py,00,wikihouse,#!/usr/bin/env python\n# -*- coding: utf-8 -*-...,2,44,0.017241,116
1,data/0rpc/zerorpc-python/zerorpc/events.py,0rpc,zerorpc-python,# -*- coding: utf-8 -*-\n# Open Source Initiat...,24,1244,0.066116,363
2,data/0xadada/dockdj/app/manage.py,0xadada,dockdj,"#!/usr/bin/env python\n""""""Django's command lin...",1,21,0.076923,13
3,data/1stvamp/hippybot/setup.py,1stvamp,hippybot,"""""""Installer for hippybot\n""""""\n\nimport os\nc...",0,0,0.000000,34
4,data/2buntu/2buntu-blog/manage.py,2buntu,2buntu-blog,#!/usr/bin/env python\nimport os\nimport sys\n...,1,21,0.090909,11
...,...,...,...,...,...,...,...,...
99995,data/Havate/havate-openstack/proto-build/gui/h...,Havate,havate-openstack,# vim: tabstop=4 shiftwidth=4 softtabstop=4\n\...,15,645,0.483871,31
99996,data/Havate/havate-openstack/proto-build/gui/h...,Havate,havate-openstack,# vim: tabstop=4 shiftwidth=4 softtabstop=4\n\...,21,807,0.344262,61
99997,data/Havate/havate-openstack/proto-build/gui/h...,Havate,havate-openstack,# vim: tabstop=4 shiftwidth=4 softtabstop=4\n\...,18,771,0.028037,642
99998,data/Havate/havate-openstack/proto-build/gui/h...,Havate,havate-openstack,# vim: tabstop=4 shiftwidth=4 softtabstop=4\n\...,20,846,0.050000,400
