In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_rows', 2000)

from predpatt import PredPatt
from predpatt import load_conllu
from predpatt import PredPattOpts

In [2]:
# Load the per-rating data. The file is tab-separated despite its .csv extension.
# The 'Unnamed: 0' column in the preview comes from an unnamed first column in
# the file (presumably a saved row index — verify against the source data).
somedata = pd.read_csv('corpus_some-master/data/some_database.csv', sep='\t')
somedata.head()

Unnamed: 0,Item,workerid,Rating,Partitive,StrengthSome,Mention,Subjecthood,Modification,Sentence,SentenceLength,Trial
0,176651:43,22,5,yes,5.7,new,other,modified,"a, a contractor friend helped me, uh, with some of the, uh, foundation work",14,1
1,176651:43,25,4,yes,5.7,new,other,modified,"a, a contractor friend helped me, uh, with some of the, uh, foundation work",14,1
2,176651:43,33,7,yes,5.7,new,other,modified,"a, a contractor friend helped me, uh, with some of the, uh, foundation work",14,1
3,176651:43,9,3,yes,5.7,new,other,modified,"a, a contractor friend helped me, uh, with some of the, uh, foundation work",14,1
4,176651:43,18,7,yes,5.7,new,other,modified,"a, a contractor friend helped me, uh, with some of the, uh, foundation work",14,1


In [3]:
# One row per Item (first occurrence kept), indexed by Item, holding only the sentence text.
uniq = (
    somedata[['Item', 'Sentence']]
    .drop_duplicates('Item')
    .set_index('Item')
)
print(uniq.shape)

(1362, 1)


In [4]:
import stanza
from stanza.utils.conll import CoNLL

# Parse every unique sentence in one pass. Sentences are joined with blank
# lines and tokenize_no_ssplit=True is set, so each input sentence is meant to
# come back as exactly one parsed sentence.
document = '\n\n'.join(uniq['Sentence'].tolist())
nlp = stanza.Pipeline(lang='en', tokenize_no_ssplit=True)
doc = nlp(document)
dic = doc.to_dict()
conllu = CoNLL.convert_dict(dic)

# Later cells attach per-sentence results back onto `uniq` by position, so a
# sentence-count mismatch has to stop the run here rather than misalign rows.
if len(conllu) != len(uniq):
    raise ValueError(
        "parser produced %d sentences for %d input sentences; "
        "positional alignment with `uniq` would be wrong" % (len(conllu), len(uniq))
    )

# Write CoNLL-U: one tab-separated line per word, blank line after each sentence.
with open('corpus_some-master/allsents.conllu', 'w') as f:
    for sent in conllu:
        for word in sent:
            f.write("\t".join(word) + "\n")
        f.write("\n")

2020-07-20 15:31:40 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-07-20 15:31:40 INFO: Use device: cpu
2020-07-20 15:31:40 INFO: Loading: tokenize
2020-07-20 15:31:40 INFO: Loading: pos
2020-07-20 15:31:41 INFO: Loading: lemma
2020-07-20 15:31:41 INFO: Loading: depparse
2020-07-20 15:31:42 INFO: Loading: ner
2020-07-20 15:31:43 INFO: Done loading processors!


In [5]:
# PredPatt extraction settings used for every sentence.
options = PredPattOpts(
    resolve_relcl=True,
    borrow_arg_for_relcl=True,
    resolve_conj=False,
    cut=True,
)

# Read the whole CoNLL-U file into memory; the handle is closed before parsing starts.
with open('corpus_some-master/allsents.conllu', 'r') as f:
    data = f.read()

# One PredPatt object per sentence; the sentence id from load_conllu is not needed.
parsed = [PredPatt(parse, opts=options) for _, parse in load_conllu(data)]

In [6]:
from collections import Counter

# Dependency relations accepted when linking 'some' to a noun:
# - 'some' has no dependents: look at the edge from its governor down to 'some'
# - 'some' has dependents: look at the edges from 'some' down to its dependents
RELS_SOME_AS_DEPENDENT = ['det', 'obj', 'nsubj', 'nmod', 'conj']
RELS_SOME_AS_HEAD = ['nsubj', 'nmod', 'conj']

deprels = []  # left empty: nothing in this cell appends to it
nps = []      # one space-joined string per sentence, in the same order as `parsed`
for sent in parsed:
    # First token spelled 'some' (case-insensitive); list.index raises
    # ValueError if the sentence has no such token.
    someind = [a.text.lower() for a in sent.tokens].index('some')
    some_tok = sent.tokens[someind]

    # The result is deliberately NOT named `np`: that would rebind the numpy
    # alias imported at the top of the notebook for the rest of the session.
    if not some_tok.dependents:
        head = some_tok.gov
        # NOTE(review): a root 'some' presumably has gov None — confirm against PredPatt.
        edges = head.dependents if head is not None else []
        head_words = [a.gov.text for a in edges
                      if a.dep.text.lower() == 'some' and a.rel in RELS_SOME_AS_DEPENDENT]
    else:
        head_words = [a.dep.text for a in some_tok.dependents
                      if a.rel in RELS_SOME_AS_HEAD]
    nps.append(' '.join(head_words))

In [7]:
uniq['NP'] = nps

In [8]:
# Copy each item's NP onto every rating row of that item. `uniq` is indexed by
# Item (unique after drop_duplicates), so an index-aligned map replaces the
# row-by-row .loc lookup.
somedata['NP'] = somedata['Item'].map(uniq['NP'])

In [9]:
# somedata.to_csv('corpus_some-master/somedata_nps.tsv', sep="\t", index=False)

In [10]:
# Simple way to figure out if it's countable or not
# uniq['Count'] = uniq['NP'].apply(lambda x: x[-1]=='s' if x else False)
# uniq.to_csv('corpus_some-master/somedata_npscount.tsv', sep="\t")

In [11]:
# For now keep only items whose extracted NP is exactly one word. This drops
# multi-word NPs and also items for which no NP was found (empty string).
uniq['len'] = uniq['NP'].apply(lambda x: len(x.split()))
# .copy() makes the filtered frame independent, so adding 'lemma' below does
# not assign into a slice of the unfiltered frame (SettingWithCopyWarning).
uniq = uniq[uniq['len']==1].copy()


lemmatizer = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')

# Many items share the same NP, so run the pipeline once per distinct NP and
# map the results back. The lemma is that of the first word of the first sentence.
lemma_by_np = {
    word: lemmatizer(word).sentences[0].words[0].lemma
    for word in uniq['NP'].unique()
}
uniq['lemma'] = uniq['NP'].map(lemma_by_np)

2020-07-20 15:34:16 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-07-20 15:34:16 INFO: Use device: cpu
2020-07-20 15:34:16 INFO: Loading: tokenize
2020-07-20 15:34:16 INFO: Loading: pos
2020-07-20 15:34:17 INFO: Loading: lemma
2020-07-20 15:34:17 INFO: Done loading processors!


In [12]:
# CELEX noun table, indexed by noun so that words can be looked up directly.
celex = pd.read_csv('corpus_some-master/celex.csv').set_index('Noun')

def count_mass(col, word):
    """Return the value of column `col` for `word` in the global `celex` table.

    Returns the string 'NF' when `word` is not in the table's index.
    """
    if word not in celex.index:
        return 'NF'
    return celex.loc[word, col]

In [13]:
# Annotate each lemma with its CELEX countable / uncountable entry
# (count_mass returns 'NF' for lemmas that are not in the table).
for new_col, celex_col in (('Count', 'Celex Countable'), ('Mass', 'Celex Uncountable')):
    uniq[new_col] = uniq['lemma'].apply(lambda x, c=celex_col: count_mass(col=c, word=x))

In [14]:
# Preview only: display.max_rows is raised to 2000 above, so a bare `uniq`
# would render the entire frame.
uniq.head(10)

Unnamed: 0_level_0,Sentence,NP,len,lemma,Count,Mass
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
176651:43,"a, a contractor friend helped me, uh, with some of the, uh, foundation work",work,1,work,Y,Y
108182:30,i guess you have to take some things and leave some things.,things,1,thing,Y,N
20718:26,"well, i know that we have some relatives that live around, like the mumblex area in there,",relatives,1,relative,Y,N
18335:127,"just if you want a lot of, you know, light for underneath, that may be the, all right because we had, because we have some, we have some old fruit trees.",have,1,have,NF,NF
88808:43,"although, um, i have also done oh, free lance magazine writing and some educational writing.",writing,1,writing,Y,Y
148561:14,"i mean, some people sing,",people,1,people,Y,Y
176908:22,"and you say you have some strong feelings about, uh, the issue.",feelings,1,feeling,Y,Y
34501:47,"but it was on a higher level, you know, where there were some things that kids didn't understand that adults did ,",things,1,thing,Y,N
1686:32,"and, uh, quite honestly, i, i have some little children",children,1,child,Y,N
9196:49,"but i do think that, um, congress has backed down much too much on some of the air pollution standards.",standards,1,standard,Y,N


In [15]:
# Keep only rating rows whose NP is exactly one word (drops multi-word and
# empty NPs) — the same filter that was applied to `uniq`.
somedata['len'] = somedata['NP'].apply(lambda x: len(x.split()))
# .copy() makes the filtered frame independent, so the columns added below are
# not assigned into a slice of the unfiltered frame (SettingWithCopyWarning).
somedata = somedata[somedata['len']==1].copy()

# `uniq` is indexed by Item, so an index-aligned map carries the per-item
# annotations onto every rating row of that item.
somedata['Count'] = somedata['Item'].map(uniq['Count'])
somedata['Mass'] = somedata['Item'].map(uniq['Mass'])
# Rows whose lemma was not found in CELEX ('NF') are kept; no filter is applied here.

In [16]:
# Spell out the CELEX flags in the two annotation columns only; values that
# are not keys of `repl` are left as they are.
repl = {'Y':'yes', 'N':'no'}
somedata = somedata.replace({column: repl for column in ("Count", "Mass")})

In [17]:
# Attach each item's lemma via an index-aligned lookup on `uniq` (indexed by
# Item), then export the annotated ratings as a tab-separated file.
somedata['Lemma'] = somedata['Item'].map(uniq['lemma'])
somedata.to_csv('corpus_some-master/masscount.tsv', sep='\t', index=False)

In [18]:
Counter(somedata['Mass'])

Counter({'yes': 6800, 'no': 4530, 'NF': 1580})

In [19]:
Counter(somedata['Count'])

Counter({'yes': 9970, 'NF': 1580, 'no': 1360})

In [20]:
somedata.shape

(12910, 16)

In [22]:
somedata['Lemma'].unique().shape

(509,)

In [38]:
somedata[(somedata['Mass']=='yes') & (somedata['Count']=='no')]['Lemma'].unique()

array(['information', 'money', 'equipment', 'skating', 'music',
       'interference', 'harassment', 'charcoal', 'esteem', 'good', 'golf',
       'compassion', 'credence', 'cooking', 'independence', 'relevance',
       'little', 'latitude', 'steel', 'ooze', 'furniture', 'butter',
       'knowledge', 'wealth', 'broccoli', 'sewing', 'peace', 'sternness',
       'feedback', 'mayonnaise', 'gardening', 'expertise', 'cocaine',
       'continuity', 'data', 'hardware', 'wallpaper', 'discrimination',
       'loam', 'suspense', 'marketing', 'news', 'stability', 'socialism',
       'shopping', 'litigation', 'diplomacy', 'software', 'firewood',
       'training'], dtype=object)

In [96]:
somedata[somedata['Lemma']=='continuity']

Unnamed: 0,Item,workerid,Rating,Partitive,StrengthSome,Mention,Subjecthood,Modification,Sentence,SentenceLength,Trial,NP,len,Count,Mass,Lemma
8500,77906:47,99,1,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8501,77906:47,170,2,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8502,77906:47,115,2,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8503,77906:47,103,2,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8504,77906:47,187,1,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8505,77906:47,148,2,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8506,77906:47,240,1,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8507,77906:47,78,2,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8508,77906:47,131,1,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity
8509,77906:47,191,3,no,5.4,new,other,unmodified,"but, uh, i-, it seems that there should be some continuity,",11,20,continuity,1,no,yes,continuity


In [87]:
somedata[somedata['Item']=='154338:10']['Rating'].mean()

6.9

79557:42
148561:14
33466:11
89238:15	