<a href="https://colab.research.google.com/github/comp-aspects-of-appl-linguistics/relation-extraction-utils/blob/master/colabs/prepare-stadfordnlp-and-pss-environment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Clone the avst34/nlp repository into ./nlp; later cells cd into it and
# overwrite one of its modules.
!git clone https://github.com/avst34/nlp.git

In [0]:
# Download and unpack OpenJDK 9 (later cells run /content/jdk-9/bin/java).
# -nc skips the download when the archive is already present, so the cell can
# be re-run; tar runs without -v so the output is not flooded with the listing.
!wget -nc https://download.java.net/java/GA/jdk9/9/binaries/openjdk-9_linux-x64_bin.tar.gz
!tar xzf openjdk-9_linux-x64_bin.tar.gz

In [0]:
# Install the dynet package; -q keeps pip's output short.
!pip -q install dynet

In [0]:
# Work inside the cloned repository: the following cells use paths relative
# to it (models/..., corenlp/...).
%cd /content/nlp

In [0]:
# Switch to a local pss-cli branch based on origin/pss-cli. -B (rather than -b)
# resets the branch when it already exists, so re-running this cell does not fail.
!git checkout -B pss-cli origin/pss-cli

In [0]:
%%bash

cat > models/supersenses/preprocessing/corenlp.py << EOF
import os
import subprocess
import time
from tempfile import NamedTemporaryFile

import requests

CORENLP_SERVER_PORT = 9000

class CoreNLPServer(object):
    """Manages a Stanford CoreNLP server run as a local child process.

    The java binary, the CoreNLP distribution directory and the log file
    location are hard-coded paths.
    """

    def __init__(self):
        # Popen handle of the server process; stays None until start() runs.
        self.handle = None

    def start(self, port):
        """Launch the CoreNLP server on the given port.

        The port is also stored in the module-level CORENLP_SERVER_PORT so
        that later requests are sent to the same port.

        Args:
            port: TCP port number the server should listen on.

        Raises:
            Exception: if the java process has already exited one second
                after being launched.
        """
        global CORENLP_SERVER_PORT

        print('setting CORENLP_SERVER_PORT to ', port)
        CORENLP_SERVER_PORT = port
        # Resolve __file__ first: a bare relative file name has an empty
        # dirname, which would turn the relative hops below into a path
        # under the filesystem root.
        corenlp_home = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            '..', '..', '..', 'corenlp', 'stanford-corenlp-full-2017-06-09'
        )

        command = [
            '/content/jdk-9/bin/java', '--add-modules', 'java.se.ee', '-mx8g',
            # No shell is involved, so the literal * reaches java unexpanded
            # and is interpreted relative to cwd (corenlp_home).
            '-cp', '*',
            'edu.stanford.nlp.pipeline.StanfordCoreNLPServer',
            '-port', str(port),
            '-timeout', '15000',
        ]
        # The child process receives its own copy of the log descriptor, so
        # the parent can close its handle as soon as the process is spawned.
        with open('/tmp/tmp', 'a') as log_file:
            self.handle = subprocess.Popen(
                args=command,
                cwd=corenlp_home,
                stdout=log_file,
                stderr=subprocess.STDOUT
            )
        # Give the JVM a moment; an immediate exit means the launch failed.
        time.sleep(1)
        if self.handle.poll() is not None:
            raise Exception("Error starting CoreNLP server")

    def stop(self):
        """Terminate the server process, if one was started, and reap it.

        Does nothing when start() was never called.
        """
        if self.handle is None:
            return
        self.handle.terminate()
        try:
            # Reap the child so it does not linger as a zombie process.
            self.handle.wait(timeout=10)
        except subprocess.TimeoutExpired:
            # The process ignored the termination request; force it down.
            self.handle.kill()
            self.handle.wait()


def run_corenlp(tokens, format='conllu', use_server=True):
    """Annotate one pre-tokenized sentence with CoreNLP and return the raw output.

    Args:
        tokens: iterable of token strings; they are joined with single
            spaces, and CoreNLP is told to tokenize on whitespace and to
            treat the input as a single sentence.
        format: output format, one of 'conllu', 'conll' or 'json'.
        use_server: when true, POST the sentence to the server at
            127.0.0.1:CORENLP_SERVER_PORT; otherwise run the command-line
            pipeline on a temporary file.

    Returns:
        The annotation text as a string (in server mode CRLF line endings
        are normalized to LF).

    Raises:
        AssertionError: if format is not one of the supported values.
    """

    print('in run_corenlp with port of ',CORENLP_SERVER_PORT )
    assert format in ['conllu', 'conll', 'json']
    sentence = ' '.join(tokens)
    if use_server:
        params = {
            'outputFormat': format,
            'ssplit.isOneSentence': 'true',
            'tokenize.whitespace': 'true',
            'annotators': "tokenize,ssplit,pos,lemma,ner,parse,dcoref,udfeats"
        }
        r = requests.post(
            'http://127.0.0.1:{:d}/'.format(CORENLP_SERVER_PORT),
            params=params,
            data=sentence.encode('utf-8')
        )
        out = r.text.replace('\r\n', '\n')
    else:
        input_file = NamedTemporaryFile(delete=False)
        try:
            # The temporary file is opened in binary mode, so the sentence
            # has to be encoded before it is written.
            input_file.write(sentence.encode('utf-8'))
            input_file.close()
            # NOTE(review): -filelist is handed the file that holds the
            # sentence itself, and neither -cp nor -outputDirectory is
            # passed; presumably this relies on CLASSPATH and on the output
            # landing next to the input file. Confirm against the CoreNLP
            # command-line documentation.
            os.system('/content/jdk-9/bin/java -mx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -outputFormat ' + format + ' -filelist ' + input_file.name + ' -depparse BasicDependenciesAnnotation -ssplit.isOneSentence true -tokenize.whitespace true')
            # The output path is derived from the file name, not from the
            # file object.
            with open(input_file.name + '.' + format, 'r') as f:
                out = f.read()
        finally:
            # close() is harmless when the file is already closed; it covers
            # the case where write() failed before the explicit close above.
            input_file.close()
            os.unlink(input_file.name)

    return out
EOF

In [0]:
# Fetch the CoreNLP 2017-06-09 distribution and unpack it into ./corenlp.
# Each step skips work that is already done: wget -nc keeps an existing
# download, mkdir -p accepts an existing directory, unzip -n never overwrites
# existing files.
!wget -nc http://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip
!mkdir -p corenlp
!unzip  -n stanford-corenlp-full-2017-06-09.zip -d corenlp


In [0]:
# Fetch the binary saved as /tmp/gdrive, which is used below to download files
# by id. The URL is quoted so the shell does not treat "&" as the background
# operator and silently drop the export=download parameter.
!wget -O /tmp/gdrive 'https://docs.google.com/uc?id=0B3X9GlR6EmbnWksyTEtCM0VfaFE&export=download'
!chmod +x /tmp/gdrive
# Download two files by id, then move the wiki.en.chunked* files into the
# embeddings directory of the repository.
!/tmp/gdrive download 1Vd1QlacUaSoMMlbGe6lhaVKk9LTmJFBn
!/tmp/gdrive download 1z2D0VQlSAw9GjYdxBOsED3RC9HkY8-lx
!mv wiki.en.chunked* models/supersenses/embeddings

In [0]:
# Download the NLTK 'punkt' resource.
import nltk
nltk.download('punkt')

In [0]:
#### PREPARATIONS:

# STEP 1: ensure 'relation_extraction_utils' is installed - as stanfordnlp is 
#         a dependency of 'relation_extraction_utils' it will be installed if 
#         necessary.
#         Additionally, stanfordnlp_resources are downloaded if not already present
!pip install git+https://github.com/comp-aspects-of-appl-linguistics/relation_extraction_utils.git

import os
import stanfordnlp

# Download the English resources only when the resources directory is absent.
if not os.path.exists('/root/stanfordnlp_resources'):
  stanfordnlp.download('en', force=True)

  
# STEP 2: we'll need to access Google Drive to fetch the 'train.json' file -
#         at the end of the next snippet the user will be prompted to click a
#         link from which an OAuth token can be generated; the token is then
#         copied at the prompt
  
!pip install PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
# Reuse the application-default credentials for the PyDrive client.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)


# NOTE: The id of the file assigned to 'train.json' may change
drive_id_for_train_json_file='1nPpp1zs3_0rkPkCvuz-dSK4YygxO6lw1'

# get the file into our current environment's file system
# (it is written to 'train.json' in the current working directory)
downloaded = drive.CreateFile({'id':drive_id_for_train_json_file}) 
downloaded.GetContentFile('train.json')  

In [27]:
# Pipe train.json through tac_to_csv (--relation no_relation), keep the first
# 8 lines of its output and pass them to parse_ud.
# NOTE(review): tac_to_csv and parse_ud are presumably console scripts
# installed with relation_extraction_utils - confirm.
!cat train.json | tac_to_csv --relation no_relation | head -8 | parse_ud

tcmalloc: large alloc 2401861632 bytes == 0x200fe000 @  0x7f2c5f3f71e7 0x59213c 0x4c414e 0x565b49 0x5a3761 0x4d4c06 0x4d5140 0x4d62f3 0x503091 0x506859 0x504c28 0x58650d 0x59ebbe 0x507c17 0x504c28 0x502540 0x502f3d 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x58644b 0x4ac411 0x56074d 0x50677d 0x502209 0x502f3d 0x506859 0x504c28 0x501b2e
tcmalloc: large alloc 2401861632 bytes == 0xaf396000 @  0x7f2c5f3f71e7 0x53b7df 0x53f768 0x4d515d 0x4d62f3 0x503091 0x506859 0x504c28 0x58650d 0x59ebbe 0x507c17 0x504c28 0x502540 0x502f3d 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x58644b 0x4ac411 0x56074d 0x50677d 0x502209 0x502f3d 0x506859 0x504c28 0x501b2e 0x591461 0x54b813 0x555421
tcmalloc: large alloc 2401861632 bytes == 0x13e62e000 @  0x7f2c5f3f71e7 0x53b7df 0x5244ed 0x524e8f 0x53fb91 0x4d515d 0x4d62f3 0x503091 0x506859 0x504c28 0x58650d 0x59ebbe 0x507c17 0x504c28 0x502540 0x502f3d 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x58644b 0x4ac411 0x56074d 0x50677d 0x502209 0x502f3d 0x506859 0x5