In [14]:
import os
import json
import pandas as pd

def load_acl_accepted_papers(file_path):
    """Read the accepted-papers list into a set of entries.

    Each non-blank line of the file contributes one entry, with surrounding
    whitespace removed.

    Args:
        file_path: Path to a UTF-8 text file holding one entry per line.

    Returns:
        set[str]: The distinct, stripped, non-empty lines of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which would otherwise stay glued to the first entry and make it
    # unmatchable.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        # Blank lines would only add a meaningless '' member to the set.
        return {entry for entry in stripped_lines if entry}

def preprocess_reviews(reviews):
    """Flatten paper records into a DataFrame with one row per review.

    Args:
        reviews: Iterable of paper dicts. The keys 'id', 'title', 'accepted'
            and 'reviews' are read with ``.get``, so absent keys become
            ``None`` (or no rows at all, for a missing 'reviews' list).

    Returns:
        pandas.DataFrame: One row per review, carrying the paper-level fields
        alongside the reviewer comment and the per-aspect review fields.
    """
    # (output column, key in the review dict) in the column order of the result.
    review_columns = (
        ('reviewer_comments', 'comments'),
        ('originality', 'ORIGINALITY'),
        ('recommendation', 'RECOMMENDATION'),
        ('replicability', 'REPLICABILITY'),
        ('presentation_format', 'PRESENTATION_FORMAT'),
        ('clarity', 'CLARITY'),
        ('meaningful_comparison', 'MEANINGFUL_COMPARISON'),
        ('substance', 'SUBSTANCE'),
        ('reviewer_confidence', 'REVIEWER_CONFIDENCE'),
        ('soundness_correctness', 'SOUNDNESS_CORRECTNESS'),
        ('appropriateness', 'APPROPRIATENESS'),
        ('impact', 'IMPACT'),
    )

    rows = []
    for paper in reviews:
        # Paper-level values are repeated on every review row of that paper.
        paper_fields = {
            'paper_id': paper.get('id'),
            'paper_title': paper.get('title'),
            'paper_accepted': paper.get('accepted'),
        }
        for entry in paper.get('reviews', []):
            row = dict(paper_fields)
            for column, key in review_columns:
                row[column] = entry.get(key)
            rows.append(row)

    return pd.DataFrame(rows)

def load_conference_data(conference, data_type, acl_accepted_papers):
    """Load all review JSON files of one conference split into a DataFrame.

    Reads every ``*.json`` file in
    ``<base_dir>/<conference>/<data_type>/reviews``, sets an ``accepted`` flag
    on each paper record and hands the records to ``preprocess_reviews``.

    Args:
        conference: Conference folder name. 'acl_2017', 'conll_2016' and
            'iclr_2017' each get an acceptance rule; any other value leaves
            the record as loaded.
        data_type: Split folder name (the notebook uses 'train', 'test', 'dev').
        acl_accepted_papers: Collection of accepted entries as strings, as
            produced by ``load_acl_accepted_papers``; only consulted for
            'acl_2017'.

    Returns:
        The value returned by ``preprocess_reviews`` for the loaded records.

    Raises:
        OSError: If the reviews folder or a file in it cannot be read.
        KeyError: If an 'acl_2017' record has no 'id' key.
    """
    data_dir = os.path.join(base_dir, conference, data_type, 'reviews')
    all_data = []

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # of the resulting frame reproducible across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.json'):
            continue

        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)

        if conference == 'acl_2017':
            # The accepted set holds strings read from a text file, so compare
            # as text; a numeric JSON id would otherwise never match.
            # NOTE(review): if that file lists something other than paper ids
            # (e.g. titles) this lookup can never succeed - confirm its contents.
            data['accepted'] = str(data['id']) in acl_accepted_papers
        elif conference == 'conll_2016':
            # Every CoNLL 2016 record is marked as accepted.
            data['accepted'] = True
        elif conference == 'iclr_2017':
            # Keep the flag stored in the file, defaulting to rejected.
            data['accepted'] = data.get('accepted', False)
        all_data.append(data)

    return preprocess_reviews(all_data)

def combine_data():
    """Build the combined train, test and dev DataFrames for all conferences.

    For each split, the per-conference frames from ``load_conference_data``
    are concatenated in the order acl_2017, conll_2016, iclr_2017 with a fresh
    0..n-1 index.

    Returns:
        tuple: ``(train_df, test_df, dev_df)``.
    """
    conferences = ['acl_2017', 'conll_2016', 'iclr_2017']
    # Resolved from base_dir instead of repeating the absolute path, so
    # relocating the data folder only needs one change.
    acl_accepted_file = os.path.join(base_dir, 'acl_accepted.txt')
    acl_accepted_papers = load_acl_accepted_papers(acl_accepted_file)

    def load_split(data_type):
        # One frame per conference, stacked under a fresh index.
        frames = [
            load_conference_data(conf, data_type, acl_accepted_papers)
            for conf in conferences
        ]
        return pd.concat(frames, ignore_index=True)

    train_df = load_split('train')
    test_df = load_split('test')
    dev_df = load_split('dev')

    return train_df, test_df, dev_df

# Root folder that holds one sub-folder per conference. Set the
# PEERREVIEWS_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used.
base_dir = os.environ.get(
    'PEERREVIEWS_DATA_DIR',
    r'C:\Users\TANAYA\Documents\GitHub\PeerReviews\data',
)

# Load all three splits once; later cells read these frames.
train_df, test_df, dev_df = combine_data()



In [17]:
# Label the output, then leave the frame as the cell's last expression so the
# notebook renders it as a table.
print("Training DataFrame:")
train_df




Training DataFrame:


Unnamed: 0,paper_id,paper_title,paper_accepted,reviewer_comments,originality,recommendation,replicability,presentation_format,clarity,meaningful_comparison,substance,reviewer_confidence,soundness_correctness,appropriateness,impact
0,104,Bridge Text and Knowledge by Learning Multi-Pr...,False,- Strengths:\n* Outperforms ALIGN in supervise...,3,3,,Poster,3,2,4,3,4,5,3
1,104,Bridge Text and Knowledge by Learning Multi-Pr...,False,This paper addresses the problem of disambigua...,3,4,,Poster,3,2,4,4,4,5,3
2,104,Bridge Text and Knowledge by Learning Multi-Pr...,False,"- Strengths:\nGood ideas, simple neural learni...",3,4,,Oral Presentation,3,2,4,4,4,4,3
3,105,Morphological Inflection Generation with Hard ...,False,- Strengths:\nThe idea of hard monotonic atten...,3,3,,Poster,5,2,4,4,4,5,3
4,105,Morphological Inflection Generation with Hard ...,False,- Strengths: A new encoder-decoder model is pr...,3,3,,Oral Presentation,5,2,4,3,4,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6216,793,Surprisal-Driven Feedback in Recurrent Networks,False,"\nThis paper proposes to leverage ""surprisal"" ...",,3.0,,,,,,5.0,,,
6217,793,Surprisal-Driven Feedback in Recurrent Networks,False,Source Code (C++/CUDA) for reproducing the res...,,,,,,,,,,,
6218,793,Surprisal-Driven Feedback in Recurrent Networks,False,,,,,,,,,,,,
6219,793,Surprisal-Driven Feedback in Recurrent Networks,False,,,,,,,,,,,,


In [18]:
# Label the output, then leave the frame as the cell's last expression so the
# notebook renders it as a table.
print("\nTest DataFrame:")
test_df




Test DataFrame:


Unnamed: 0,paper_id,paper_title,paper_accepted,reviewer_comments,originality,recommendation,replicability,presentation_format,clarity,meaningful_comparison,substance,reviewer_confidence,soundness_correctness,appropriateness,impact
0,148,Evaluation Metrics for Machine Reading Compreh...,False,- Strengths:\n\n- this article puts two fields...,3,4,,Oral Presentation,2,3,4,3,4,4,2
1,323,A Neural Local Coherence Model,False,The paper introduces an extension of the entit...,5,4,,Poster,4,3,4,3,5,5,3
2,323,A Neural Local Coherence Model,False,The paper proposes a convolutional neural netw...,5,3,,Poster,4,3,3,3,5,5,3
3,355,Neural Modeling of Multi-Predicate Interaction...,False,- Strengths:\n\nThis paper presents a sophisti...,5,4,,Oral Presentation,4,3,4,3,5,5,3
4,355,Neural Modeling of Multi-Predicate Interaction...,False,This paper proposes new prediction models for ...,5,4,,Oral Presentation,4,3,4,5,5,5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661,778,Ternary Weight Decomposition and Binary Activa...,False,I do need to see the results in a clear table....,,5.0,,,,,,3.0,,,
662,778,Ternary Weight Decomposition and Binary Activa...,False,This paper explores a new quantization method ...,,4.0,,,,,,4.0,,,
663,778,Ternary Weight Decomposition and Binary Activa...,False,I suggest to refer the following two papers.\n...,,,,,,,,,,,
664,778,Ternary Weight Decomposition and Binary Activa...,False,,,,,,,,,,,,


In [20]:
pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 3.4 MB/s eta 0:00:00
Collecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
     ---------------------------------------- 97.9/97.9 kB 5.5 MB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2024.7.24-cp310-cp310-win_amd64.whl (269 kB)
     -------------------------------------- 269.7/269.7 kB 3.3 MB/s eta 0:00:00
Installing collected packages: regex, click, nltk
Successfully installed click-8.1.7 nltk-3.8.1 regex-2024.7.24
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Download required NLTK resources
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Shared helpers read by preprocess_text
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one review comment into a space-separated token string.

    Steps: lowercase, delete punctuation, tokenize with the module-level
    ``tokenizer``, drop tokens found in ``stop_words``, lemmatize each
    remaining token with ``lemmatizer``, and join the result with spaces.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or NaN for a review without a comment) is treated as
            empty.

    Returns:
        str: The cleaned text; ``''`` when there is nothing to clean.
    """
    # Missing comments arrive as None/NaN; .lower() would raise on them.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and punctuation. The characters are deleted,
    # not replaced by a space, so words joined by punctuation merge into one
    # token (e.g. 'encoder-decoder' becomes 'encoderdecoder').
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize
    tokens = tokenizer.tokenize(text)

    # Remove stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    # Join tokens back into string
    return ' '.join(tokens)

# Clean every reviewer comment and store the result as a new column of train_df
cleaned_comments = train_df['reviewer_comments'].apply(preprocess_text)
train_df['cleaned_reviewer_comments'] = cleaned_comments

# Preview raw and cleaned text side by side for the first rows
train_df[['reviewer_comments', 'cleaned_reviewer_comments']].head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TANAYA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TANAYA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\TANAYA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,reviewer_comments,cleaned_reviewer_comments
0,- Strengths:\n* Outperforms ALIGN in supervise...,strength outperforms align supervised entity l...
1,This paper addresses the problem of disambigua...,paper address problem disambiguatinglinking te...
2,"- Strengths:\nGood ideas, simple neural learni...",strength good idea simple neural learning inte...
3,- Strengths:\nThe idea of hard monotonic atten...,strength idea hard monotonic attention new sub...
4,- Strengths: A new encoder-decoder model is pr...,strength new encoderdecoder model proposed exp...


In [22]:
# Show the first cleaned comments on their own (values are truncated in the display).
train_df['cleaned_reviewer_comments'].head()

0    strength outperforms align supervised entity l...
1    paper address problem disambiguatinglinking te...
2    strength good idea simple neural learning inte...
3    strength idea hard monotonic attention new sub...
4    strength new encoderdecoder model proposed exp...
Name: cleaned_reviewer_comments, dtype: object

In [23]:
# Display a specific entry for cleaned_reviewer_comments
def display_full_review(index):
    """Print the complete cleaned review stored at one row of ``train_df``.

    Args:
        index: Row label to look up in ``train_df`` (looked up with ``.loc``).

    Prints the review text, or "Index out of range." when ``train_df`` has no
    row with that label.
    """
    try:
        review_text = train_df.loc[index, 'cleaned_reviewer_comments']
    except (KeyError, IndexError):
        # Label-based .loc reports a missing row as KeyError, so IndexError
        # alone would let the lookup failure escape as a traceback.
        print("Index out of range.")
        return

    print(f"Full Review at index {index}:\n")
    print(review_text)
        
# Row label of train_df whose cleaned review should be printed in full
index_to_view = 0  # Edit this value and re-run the cell to inspect another review
display_full_review(index_to_view)


Full Review at index 0:

strength outperforms align supervised entity linking task suggests proposed framework improves representation text knowledge learned jointly direct comparison closely related approach using similar input data analysis smoothing parameter provides useful analysis since impact popularity persistent issue entity linking weakness comparison align could better align used content window size 10 v paper 5 vector dimension 500 v paper 200 also clear whether ne_j includes entity link e_j graph directed consists wikipedia outlinks adjacency defined would undirected graph align context entity set entity link entity ne_j different cannot tell much impact change learned vector could contribute difference score entity similarity task sometimes difficult follow whether mention mean string type particular mention particular document phrase mention embedding used appears embeddings learned mention sens difficult determine impact sense disambiguation order without comparison uns