Reorganize the source code, update the scripts and the corresponding README #4

Open
wants to merge 3 commits into base: master
144 changes: 144 additions & 0 deletions .gitignore
@@ -0,0 +1,144 @@
.vscode/*
log*
log.preprocess
java-dataset/*
.dockerignore

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
# lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/
Binary file removed evaluation/__init__.pyc
Binary file removed evaluation/bleu/__init__.pyc
Binary file removed evaluation/bleu/bleu.pyc
Binary file removed evaluation/cider/__init__.pyc
Binary file removed evaluation/cider/cider.pyc
Binary file removed evaluation/cider/cider_scorer.pyc
Binary file removed evaluation/meteor/__init__.pyc
Binary file removed evaluation/meteor/meteor.pyc
Binary file removed evaluation/rouge/__init__.pyc
Binary file removed evaluation/rouge/rouge.pyc
Binary file removed lib/__init__.pyc
Binary file removed lib/__pycache__/__init__.cpython-36.pyc
Binary file removed lib/data/Constants.pyc
Binary file removed lib/data/Dataset.pyc
Binary file removed lib/data/Dict.pyc
Binary file removed lib/data/Tree.pyc
Binary file removed lib/data/__init__.pyc
Binary file removed lib/data/__pycache__/Constants.cpython-36.pyc
Binary file removed lib/data/__pycache__/Dataset.cpython-36.pyc
Binary file removed lib/data/__pycache__/Dict.cpython-36.pyc
Binary file removed lib/data/__pycache__/Tree.cpython-36.pyc
Binary file removed lib/data/__pycache__/__init__.cpython-36.pyc
Binary file removed lib/eval/Evaluator.pyc
Binary file removed lib/eval/__init__.pyc
Binary file removed lib/eval/__pycache__/Evaluator.cpython-36.pyc
Binary file removed lib/eval/__pycache__/__init__.cpython-36.pyc
Binary file removed lib/metric/Bleu.pyc
Binary file removed lib/metric/Loss.pyc
Binary file removed lib/metric/PertFunction.pyc
Binary file removed lib/metric/Reward.pyc
Binary file removed lib/metric/__init__.pyc
Binary file removed lib/metric/__pycache__/Bleu.cpython-36.pyc
Binary file removed lib/metric/__pycache__/Loss.cpython-36.pyc
Binary file removed lib/metric/__pycache__/PertFunction.cpython-36.pyc
Binary file removed lib/metric/__pycache__/Reward.cpython-36.pyc
Binary file removed lib/metric/__pycache__/__init__.cpython-36.pyc
Binary file removed lib/model/EncoderDecoder.pyc
Binary file removed lib/model/Generator.pyc
Binary file removed lib/model/GlobalAttention.pyc
Binary file removed lib/model/HybridAttention.pyc
Binary file removed lib/model/__init__.pyc
Binary file removed lib/model/__pycache__/EncoderDecoder.cpython-36.pyc
Binary file removed lib/model/__pycache__/Generator.cpython-36.pyc
Binary file removed lib/model/__pycache__/GlobalAttention.cpython-36.pyc
Binary file removed lib/model/__pycache__/HybridAttention.cpython-36.pyc
Binary file removed lib/model/__pycache__/__init__.cpython-36.pyc
Binary file removed lib/parser/JavaLexer.pyc
Binary file removed lib/parser/JavaListener.pyc
Binary file removed lib/parser/JavaParser.pyc
Binary file removed lib/parser/__init__.pyc
Binary file removed lib/train/Optim.pyc
Binary file removed lib/train/ReinforceTrainer.pyc
Binary file removed lib/train/Trainer.pyc
Binary file removed lib/train/__init__.pyc
Binary file removed lib/train/__pycache__/Optim.cpython-36.pyc
Binary file removed lib/train/__pycache__/Trainer.cpython-36.pyc
Binary file removed lib/train/__pycache__/__init__.cpython-36.pyc
42 changes: 42 additions & 0 deletions readme.md
@@ -3,6 +3,47 @@ This repo was developed in the following environment:
- Python 2.7
- PyTorch 0.2

However, it is now hard to build an environment that satisfies both.
There is a workaround that avoids installing Python 2 system-wide: if you use `anaconda` or `miniconda`, you can create a virtual environment with
```
conda create -n py2 python=2.7
```

Then activate the environment (`conda activate py2`, or `source activate py2` on older conda) and install PyTorch 0.2 with

```
conda install pytorch=0.2.0 cuda90 -c pytorch
```
Before that, check your CUDA version if you want to use the GPU:

```
nvcc --version
```
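
After installing, you can optionally sanity-check the build from Python. This check is not part of the original scripts; inside the `py2` environment it should print the installed version and, on a working GPU setup, `True`:

```
# Optional sanity check (not part of this PR's scripts); run inside the py2 environment.
import torch

print(torch.__version__)           # expected: 0.2.0 (or a 0.2.0 build string)
print(torch.cuda.is_available())   # True if the CUDA build can see a GPU
```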

## (Updated guideline) Follow the steps using the scripts
After cloning the project, run `chmod +x <script>.sh` on each step's script to make it executable.

`step0-start_envirionment.sh`
Build the environment.

`step1-preproces.sh`
Preprocess the raw data.

`step2-prepare_training.sh`
Set DATA_DIR to the absolute path of the dataset.
The -1 values are just placeholders.

`step3-training.sh`
Starts training; you should specify `--data_dir` (same as above).
You should also set your log path in the source of `run.py`.

`step4-testing.sh`
Starts testing; you should specify `--data_dir`.
The -1 values are just placeholders.
You should also set your log path in the source of `run.py`.

After testing, the results are in the `dataset/result` folder.
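
If you prefer a single driver, the sketch below chains the step scripts in the same `os.system` style that `run.py` uses. It is only an illustration, not a script shipped in this PR; it assumes you have already made the scripts executable and set DATA_DIR / `--data_dir` inside step2 to step4 as described above.

```
# Hypothetical driver (not included in this PR): runs the step scripts in order
# and stops at the first failure. Assumes chmod +x has been applied and that
# DATA_DIR / --data_dir have been edited inside the scripts as described above.
import os
import sys

steps = [
    "./step0-start_envirionment.sh",
    "./step1-preproces.sh",
    "./step2-prepare_training.sh",
    "./step3-training.sh",
    "./step4-testing.sh",
]

for step in steps:
    print("running " + step)
    if os.system(step) != 0:
        sys.exit("failed at " + step)

print("done; results should be under dataset/result")
```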

## Data folder structure
`/media/BACKUP/ghproj_d/code_summarization/github-python/` is the folder that holds all the data for this project; please replace it with your own folder.
The data files are organized as follows on my machine:
@@ -54,6 +95,7 @@ python run.py test_a2c hybrid 1 0
```



## TODO
- To build the AST, during data preprocessing I parse the AST into JSON and then parse the JSON back into an AST at training time. This approach is not elegant; a minimal sketch of the round trip is shown after this list.
- During training, I don't know how to batch the ASTs, so I have to put them into a list and encode them one by one. This is inefficient and makes one training epoch take about 2-3 hours. Please let me know if you have a better way to accelerate this process.
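
For reference, the JSON round trip mentioned in the first TODO item looks roughly like the sketch below. The `Node`, `to_dict`, and `from_dict` names are hypothetical, chosen only to illustrate the serialize/deserialize pattern; the project's real tree handling lives in the preprocessing code and `lib/data/Tree.py`.

```
# Hypothetical sketch of the AST <-> JSON round trip described above.
# Node/to_dict/from_dict are illustrative names, not the project's real API.
import json

class Node(object):
    def __init__(self, label, children=None):
        self.label = label
        self.children = children or []

def to_dict(node):
    # preprocessing side: turn the tree into plain dicts so it can be dumped as JSON
    return {"label": node.label, "children": [to_dict(c) for c in node.children]}

def from_dict(d):
    # training side: rebuild the tree from the parsed JSON
    return Node(d["label"], [from_dict(c) for c in d["children"]])

tree = Node("FunctionDef", [Node("arguments"), Node("Return", [Node("Name")])])
line = json.dumps(to_dict(tree))        # written once per example during preprocessing
restored = from_dict(json.loads(line))  # re-parsed for every example at training time
print(restored.label + " has " + str(len(restored.children)) + " children")
```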
4 changes: 4 additions & 0 deletions requirements.txt
@@ -0,0 +1,4 @@
asttokens
Constants
numpy
pandas
75 changes: 53 additions & 22 deletions run.py
@@ -2,30 +2,60 @@
import subprocess
import os.path
import sys
import argparse

hostname = 'ccnt-ubuntu'

parser = argparse.ArgumentParser(description='python_process.py')
parser.add_argument('para1', type=str, help='positional argument para1')
parser.add_argument('para2', type=str, help='positional argument para2')
parser.add_argument('para3', type=str, help='positional argument para3')
parser.add_argument('para4', type=str, help='positional argument para4')
parser.add_argument('para5', type=str, help='positional argument para5')
parser.add_argument('para6', type=str, help='positional argument para6')
parser.add_argument('para7', type=str, help='positional argument para7')
parser.add_argument('--data_dir', type=str, default='dataset/')
parser.add_argument('--log_path', type=str, default='log.preprocess')
opt = parser.parse_args()

data_dir = opt.data_dir

if hostname == 'ccnt-ubuntu':
print(hostname)
def preprocess():
log = '/media/BACKUP/log/code_summarization/log.preprocess'
log = opt.log_path
# log = '/media/BACKUP/log/code_summarization/log.preprocess'
if os.path.exists(log):
os.system("rm -rf %s" % log)

# run = 'python preprocess.py ' \
# '-data_name github-python ' \
# '-train_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.code ' \
# '-train_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.comment ' \
# '-train_xe_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.code ' \
# '-train_xe_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.comment ' \
# '-train_pg_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.code ' \
# '-train_pg_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.comment ' \
# '-valid_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/dev0.60.20.2.code ' \
# '-valid_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/dev0.60.20.2.comment ' \
# '-test_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/test0.60.20.2.code ' \
# '-test_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/test0.60.20.2.comment ' \
# '-save_data /media/BACKUP/ghproj_d/code_summarization/github-python/train/processed_all ' \
# '> /media/BACKUP/log/code_summarization/log.preprocess'
run = 'python preprocess.py ' \
'-data_name github-python ' \
'-train_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.code ' \
'-train_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.comment ' \
'-train_xe_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.code ' \
'-train_xe_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.comment ' \
'-train_pg_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.code ' \
'-train_pg_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/train0.60.20.2.comment ' \
'-valid_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/dev0.60.20.2.code ' \
'-valid_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/dev0.60.20.2.comment ' \
'-test_src /media/BACKUP/ghproj_d/code_summarization/github-python/train/test0.60.20.2.code ' \
'-test_tgt /media/BACKUP/ghproj_d/code_summarization/github-python/train/test0.60.20.2.comment ' \
'-save_data /media/BACKUP/ghproj_d/code_summarization/github-python/train/processed_all ' \
'> /media/BACKUP/log/code_summarization/log.preprocess'
'-train_src ' + data_dir + '/train/train0.60.20.2.code ' \
'-train_tgt ' + data_dir + '/train/train0.60.20.2.comment ' \
'-train_xe_src ' + data_dir + '/train/train0.60.20.2.code ' \
'-train_xe_tgt ' + data_dir + '/train/train0.60.20.2.comment ' \
'-train_pg_src ' + data_dir + '/train/train0.60.20.2.code ' \
'-train_pg_tgt ' + data_dir + '/train/train0.60.20.2.comment ' \
'-valid_src ' + data_dir + '/train/dev0.60.20.2.code ' \
'-valid_tgt ' + data_dir + '/train/dev0.60.20.2.comment ' \
'-test_src ' + data_dir + '/train/test0.60.20.2.code ' \
'-test_tgt ' + data_dir + '/train/test0.60.20.2.comment ' \
'-save_data ' + data_dir + '/train/processed_all ' \
'> ' + log
print(run)
a = os.system(run)
if a == 0:
@@ -36,16 +66,16 @@ def preprocess():

def train_a2c(start_reinforce, end_epoch, critic_pretrain_epochs, data_type, has_attn, gpus):
run = 'python a2c-train.py ' \
'-data /media/BACKUP/ghproj_d/code_summarization/github-python/train/processed_all.train.pt ' \
'-save_dir /media/BACKUP/ghproj_d/code_summarization/github-python/result/ ' \
'-embedding_w2v /media/BACKUP/ghproj_d/code_summarization/github-python/train/ ' \
'-data ' + data_dir + 'train/processed_all.train.pt ' \
'-save_dir ' + data_dir + 'result/ ' \
'-embedding_w2v ' + data_dir + 'train/ ' \
'-start_reinforce %s ' \
'-end_epoch %s ' \
'-critic_pretrain_epochs %s ' \
'-data_type %s ' \
'-has_attn %s ' \
'-gpus %s ' \
'> /home/wanyao/log/code_summarization/log.a2c-train_%s_%s_%s_%s_%s_g%s.test' \
'> /home/qiuyuanchen/OneDrive/Paper/CodeSum/reference/code_summarization_public/log.a2c-train_%s_%s_%s_%s_%s_g%s.test' \
% (start_reinforce, end_epoch, critic_pretrain_epochs, data_type, has_attn, gpus,
start_reinforce, end_epoch, critic_pretrain_epochs, data_type, has_attn, gpus)
print(run)
Expand All @@ -55,17 +85,18 @@ def train_a2c(start_reinforce, end_epoch, critic_pretrain_epochs, data_type, has
else:
print("failed.")
sys.exit()

def test_a2c(data_type, has_attn, gpus):
# '-load_from ' + data_dir + 'result/model_rf_hybrid_1_29_reinforce.pt ' \
run = 'python a2c-train.py ' \
'-data /media/BACKUP/ghproj_d/code_summarization/github-python/train/processed_all.train.pt ' \
'-load_from /media/BACKUP/ghproj_d/code_summarization/github-python/result/model_rf_hybrid_1_29_reinforce.pt ' \
'-embedding_w2v /media/BACKUP/ghproj_d/code_summarization/github-python/train/ ' \
'-data ' + data_dir + 'train/processed_all.train.pt ' \
'-load_from ' + data_dir + 'result/model_xent_hybrid_1_7.pt ' \
'-embedding_w2v ' + data_dir + 'train/ ' \
'-eval -save_dir . ' \
'-data_type %s ' \
'-has_attn %s ' \
'-gpus %s ' \
'> /home/wanyao/log/code_summarization/log.a2c-test_%s_%s_%s' \
'> /home/qiuyuanchen/OneDrive/Paper/CodeSum/reference/code_summarization_public/log.a2c-test_%s_%s_%s' \
% (data_type, has_attn, gpus, data_type, has_attn, gpus)
print(run)
a = os.system(run)
Binary file removed script/github/getComments.pyc