<a href="https://colab.research.google.com/github/sunkipratiksha/NLP-programs/blob/main/UN_DEBATE_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


In [None]:
# Check the content in the directory
!ls '/content/drive/My Drive/NLP PROGRAMS'

In [None]:
# Unzip the file
!unzip '/content/drive/My Drive/UNGDC_1970-2018.zip'


In [None]:
# Check the content in the converted sessions directory
!ls 'Converted sessions'


In [None]:

# Check the content in the specific session directory
!ls 'Converted sessions/Session 25 - 1970'


In [None]:
import glob
import pandas as pd
import re

# Retrieve the list of folders matching the pattern 'Converted sessions/Session*'
folders = glob.glob('Converted sessions/Session*')
print(folders)

In [None]:
# Create an empty dataframe with columns
df = pd.DataFrame(columns=['COUNTRY', 'SPEECH', 'SESSION', 'YEAR'])
print(df)


In [None]:
# Build one record per session folder from the USA speech file it contains.
records = []

# glob returns the folders in arbitrary order; sorting makes the row order reproducible.
for folder in sorted(folders):

  # Speech files in this session folder whose name starts with 'USA' and ends with '.txt'
  speech = glob.glob(folder + "/USA*.txt")

  # Skip a folder without a USA speech instead of failing on speech[0] with an IndexError.
  if not speech:
    print("No USA speech found in", folder)
    continue

  path = speech[0]

  # Parse only the file name so an underscore in a directory name cannot shift the fields.
  # The parsing assumes names of the form COUNTRY_SESSION_YEAR.txt.
  name_parts = path.split("/")[-1].split('_')

  with open(path, encoding="utf-8") as f:
    records.append({
        'COUNTRY': name_parts[0],
        'SPEECH': f.read(),
        'SESSION': name_parts[-2],
        'YEAR': name_parts[-1].split('.')[0],
    })

# Create the frame once from the collected records rather than growing it row by row.
df = pd.DataFrame(records, columns=['COUNTRY', 'SPEECH', 'SESSION', 'YEAR'])

df.head()

In [None]:
df['SPEECH'][0]

In [None]:
# TEXT CLEANING
def cleaned(text):
    """Normalise a raw speech into one line of plain text.

    Steps, in order:
      1. drop a number followed by a period ("12.") at the start of any line,
      2. replace newlines with spaces,
      3. remove every character that is not a word character, whitespace,
         '.' or '?',
      4. remove tab characters,
      5. drop a number followed by a period at the very start of the result.

    Parameters
    ----------
    text : str
        Raw speech text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Line-leading numbers must be stripped while the line breaks still exist:
    # with re.MULTILINE '^' anchors at every line start, whereas on the
    # flattened text it only matches at the very beginning of the speech.
    text = re.sub(r'^\d+\.', '', text, flags=re.MULTILINE)
    # Remove new line characters
    text = re.sub('\n', ' ', text)
    # Remove punctuation except periods (.) and question marks (?)
    text = re.sub(r"[^\w\s.?]", '', text)
    # Remove tab characters
    text = re.sub(r'\t', '', text)
    # A number can become leading only after the steps above (e.g. once a
    # preceding tab or bracket has been removed), so check the start once more.
    text = re.sub(r'^\d+\.', '', text)

    return text

# Apply `cleaned` to every value of the 'SPEECH' column and store the result in a new 'CLEANED SPEECH' column.
df['CLEANED SPEECH'] = df['SPEECH'].apply(cleaned)

In [None]:
print(df['CLEANED SPEECH'])  #this is cleaned speech


In [None]:

# SENTENCE SEGMENTATION
def sentences(text):
    """Split text into sentences on full stops and question marks.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of str
        The fragments between '.' / '?' delimiters, stripped of surrounding
        whitespace. Empty and whitespace-only fragments (for example the one
        after the final full stop) are dropped so they are not treated as
        sentences.
    """
    fragments = (part.strip() for part in re.split('[.?]', text))
    return [fragment for fragment in fragments if fragment]

# Apply `sentences` to every value of the 'CLEANED SPEECH' column; each cell of the new 'SENTENCES' column holds the list it returns.
df["SENTENCES"] = df['CLEANED SPEECH'].apply(sentences)

In [None]:
print(df["SENTENCES"])

In [None]:
# Print a sample sentence from the 'SENTENCES' column
df["SENTENCES"][1]    

In [None]:
df.shape

In [None]:
# CREATE A DATAFRAME THAT CONTAINS SENTENCES AND ITS WORDCOUNT

In [None]:
 # CREATE A DATAFRAME THAT CONTAINS SENTENCES AND ITS WORDCOUNT
row_list = []

# One output row per sentence; the year of the speech is carried onto each of its sentences.
for year, speech_sentences in zip(df['YEAR'], df['SENTENCES']):
  for sent in speech_sentences:
    # str.split() with no argument splits on any run of whitespace, so its length is the word count.
    row_list.append({'YEAR': year, 'SENTENCES': sent, 'WORDCOUNT': len(sent.split())})

# Build the frame once from the records. Passing `columns` keeps the three
# columns present even when no sentence was collected.
df2 = pd.DataFrame(row_list, columns=['YEAR', 'SENTENCES', 'WORDCOUNT'])

In [None]:
# Display the first few rows of df2
print(df2.head())

# Sample texts from dataset

1. For that reason, **President Reagan**, in his speech to this body last year, proposed that the United States and the Soviet Union exchange visits of experts at test sites to measure directly the yields of nuclear weapon test

2. **President Reagan** has directed our scientists and engineers to examine, in the light of new technologies and fully in accord with the Anti Ballistic Missile Treaty the feasibility of defense against ballistic missile attack

3. **President Reagan** approach to foreign policy is grounded squarely on standards drawn from the pragmatic American experience

4. Let me start by joining the **President of Brazil** in conveying to the people and Government of Mexico our deep sympathy over the devastation wrought by earthquakes and our solidarity with them as they work to recover and rebuild

In [None]:
#IMPORT SPACY AND MATCHER LIBRARY
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")

In [None]:
# Define a function to find the president's name in a given text
def find(text):
  """Return the text of every span in `text` that matches the president pattern.

  The pattern is the word 'president' (any case), an optional adposition,
  then a proper noun.

  Parameters
  ----------
  text : str
      Sentence to search.

  Returns
  -------
  list of str
      Matched span texts, in the order the matcher reports them; empty when
      nothing matches.
  """
  president_pattern = [
      {'LOWER': 'president'},        # the word 'president', case-insensitive
      {'POS': 'ADP', 'OP': '?'},     # optional adposition
      {'POS': 'PROPN'},              # proper noun
  ]

  matcher = Matcher(nlp.vocab)
  matcher.add('1', [president_pattern])

  parsed = nlp(text)
  return [parsed[begin:stop].text for _, begin, stop in matcher(parsed)]


In [None]:
# Run `find` on every sentence of df2 and keep the results in a new 'PRESIDENT' column
extracted = [find(sentence) for sentence in df2['SENTENCES']]
df2['PRESIDENT'] = extracted

In [None]:
print(df2.columns) 
#the new column 'PRESIDENT' is added in df2 dataframe

In [None]:
print(df2.head())

Here the **'PRESIDENT'** column holds an empty list for these rows, because not every sentence contains the pattern.

In [None]:
# Collect every PRESIDENT entry that is not an empty list
mention = [names for names in df2['PRESIDENT'] if names != []]

In [None]:
mention  #shows all presidents name 

In [None]:
# Sentences whose PRESIDENT entry is not an empty list
president_sent = [
    sentence
    for sentence, names in zip(df2['SENTENCES'], df2['PRESIDENT'])
    if names != []
]


In [None]:
president_sent[0]

In [None]:
len(president_sent)

......................................................................................................................................................................................................................................

Sample sentences from dataset:
these are the important programs used in the speech:

1. The United States will continue its strong efforts to advance **the United Nations plan** for Namibia

2. The United States and Italy have proposed **a Global Peace Operations Initiative**

3. For 35 years, **the North Atlantic alliance** has guaranteed the peace in Europe

**keywords**: plan, programme, scheme, campaign, initiative, conference, agreement, alliance

pattern: 
1.   the | DET | det
2.   United | PROPN | compound
3.   Nations | PROPN | compound
4.   plan | NOUN | dobj




In [None]:
import re

# Define a function to check if keywords are present in a text
def keywords(text):
    """Report whether `text` contains any of the programme keywords.

    A keyword counts only as a whole word (it is delimited by word
    boundaries) and is matched case-insensitively.

    Parameters
    ----------
    text : str
        Sentence to search.

    Returns
    -------
    int
        1 if at least one keyword is found, otherwise 0.
    """
    words = ['plan', 'programme', 'scheme', 'campaign', 'initiative', 'conference', 'agreement', 'alliance']

    # Case-insensitivity is passed through `flags`. An inline (?i) placed after
    # \b is not at the start of the pattern, which is deprecated and raises
    # re.error from Python 3.11 onwards.
    pattern = r'\b(?:' + '|'.join(words) + r')\b'

    return 1 if re.search(pattern, text, flags=re.IGNORECASE) else 0

# Add a 'KEYWORDS' column to df2 holding the value `keywords` returns for each sentence (1 when a keyword is found, otherwise 0).
df2['KEYWORDS'] = df2['SENTENCES'].apply(keywords)
    
    

In [None]:
print(df2.columns)

In [None]:
# Display the first few rows of df2 (head must be called; the bare attribute only shows the bound method)
df2.head()

Here the **'KEYWORDS'** column is 0 for these rows, because not every sentence contains one of the keywords defined in the list; it is 1 where a keyword is found.


In [None]:
# identify specific keywords related to schemes or initiatives mentioned in the text.

def all_keywords(text,check):
    """Extract named schemes / initiatives that end in one of the keywords.

    A match is a determiner, two proper nouns with a 'compound' dependency,
    up to three further proper nouns, then one or more keywords. The leading
    determiner is left out of the returned text.

    Parameters
    ----------
    text : str
        Sentence to search.
    check : int
        Pre-filter flag; when it equals 0 the sentence is not analysed.

    Returns
    -------
    list of str
        Extracted spans; empty when `check` is 0 or nothing matches.
    """
    schemes = []   # extracted spans

    # Return before running the spaCy pipeline: parsing is the expensive step
    # and the flag exists to avoid it for sentences without any keyword.
    if check==0:
      return schemes

    doc = nlp(text)
    word_list = ['plan', 'programme', 'scheme', 'campaign', 'initiative', 'conference', 'agreement', 'alliance']

    pattern2 = [
        {'POS': 'DET'},                             # a determiner
        {'POS': 'PROPN', 'DEP': 'compound'},        # proper noun with a compound dependency
        {'POS': 'PROPN', 'DEP': 'compound'},        # proper noun with a compound dependency
        {'POS': 'PROPN', 'OP': '?'},                # optional proper noun
        {'POS': 'PROPN', 'OP': '?'},                # optional proper noun
        {'POS': 'PROPN', 'OP': '?'},                # optional proper noun
        {'LOWER': {'IN': word_list}, 'OP': '+'}     # one or more of the keywords
    ]

    m = Matcher(nlp.vocab)
    m.add('pattern2', [pattern2])

    for _, start, end in m(doc):
        # Skip the determiner so the span starts at the first proper noun.
        if doc[start].pos_ == 'DET':
            start += 1

        span = str(doc[start:end])

        # When the previously stored span is contained in this one, the new
        # match is a longer version of it: replace instead of adding a second entry.
        if schemes and schemes[-1] in span:
            schemes[-1] = span
        else:
            schemes.append(span)

    return schemes

# Add a new column 'SCHEMES1' to df2 containing the extracted keywords.
# apply(axis=1) hands each row to the lambda, so the sentence and its KEYWORDS
# flag passed to all_keywords come from the same row.
df2['SCHEMES1']=df2.apply(lambda x: all_keywords(x.SENTENCES,x.KEYWORDS),axis=1)



In [None]:
print(df2.columns)

In [None]:
df2.head()

In [None]:
df2['KEYWORDS']

In [None]:
mention = []  # stays empty when df2 has no 'SCHEMES1' column

if 'SCHEMES1' in df2.columns:
    # Keep every SCHEMES1 entry that is not an empty list
    mention = [found for found in df2['SCHEMES1'] if found != []]



In [None]:
mention

In [None]:
df2.head()

In [None]:
print(df2.columns)

In [None]:
# Sentences whose SCHEMES1 entry is not an empty list
initiatives = [
    sentence
    for sentence, found in zip(df2['SENTENCES'], df2['SCHEMES1'])
    if found != []
]



In [None]:
initiatives

In [None]:
len(initiatives)

# RELATION EXTRACTION PART 1:

**Sample sentences from dataset:**

1. The United **States** will **support** these **principles**

2. Yet these very small **entities need** more than most the **assistance** that the United Nations system can provide

3. I have proposed to Congress that the United **States provide** additional **funding** for our work in Iraq, the greatest financial commitment of its kind since the Marshall Plan

In [None]:
def RULE_1(text):
  """Extract 'subject verb object' phrases from a sentence.

  For every token tagged VERB, each left child that is a subject
  (dependency 'nsubj' or 'nsubjpass', tagged NOUN, PROPN or PRON) is paired
  with each right child that is a direct object (dependency 'dobj', tagged
  NOUN or PROPN).

  Parameters
  ----------
  text : str
      Sentence to analyse.

  Returns
  -------
  list of str
      One "<subject text> <verb lemma> <object text>" string per
      subject/object pair; empty when no verb has both.
  """
  doc = nlp(text)
  sent = []

  for token in doc:
    if token.pos_ != 'VERB':
      continue

    subjects = [child for child in token.lefts
                if child.dep_ in ['nsubj','nsubjpass'] and child.pos_ in ['NOUN','PROPN','PRON']]
    objects = [child for child in token.rights
               if child.dep_ in ['dobj'] and child.pos_ in ['NOUN','PROPN']]

    # Each phrase is built from scratch for its own pair. Accumulating into
    # one string per verb would append a second object (or subject) to the
    # text of the first and emit a phrase with more than three parts.
    for subject in subjects:
      for obj in objects:
        sent.append(subject.text + ' ' + token.lemma_ + ' ' + obj.text)

  return sent






In [None]:


# One record per sentence of df2: its year, the sentence, and the phrases RULE_1 extracts from it.
rowlist = [
    {'YEAR': year, 'SENTENCES': sent, 'OUTPUT': RULE_1(sent)}
    for year, sent in zip(df2['YEAR'], df2['SENTENCES'])
]

df3 = pd.DataFrame(rowlist)  # the records become the DataFrame df3
#print(df3.head())



In [None]:
print(df3.head())

In [None]:
# Keep only the rows of df3 whose OUTPUT list is non-empty.
# DataFrame.append was removed in pandas 2.0, so the rows are selected with a
# boolean mask instead of being appended one at a time.
has_output = df3['OUTPUT'].map(len) != 0

# reset_index(drop=True) renumbers the kept rows from 0 and discards the old index.
df_show = df3[has_output].reset_index(drop=True)



In [None]:
print(df_show.head())

In [None]:
df3.columns

In [None]:
df_show.columns

In [None]:
df_show.shape

In [None]:

#SEPARATE SUBJECT, VERB AND OBJECT
verb_dict= {}   # number of extracted phrases per verb
separated_components = {}  # components of the phrase processed last
components = []
for i in range(len(df_show)):

  # Values of the current row of df_show
  sentence = df_show.loc[i,'SENTENCES']
  year = df_show.loc[i,'YEAR']
  output = df_show.loc[i,'OUTPUT']

  # The loop variable has its own name: reusing the sentence's name would
  # overwrite it, and the SENTENCES column would then store the extracted
  # phrase instead of the sentence it came from.
  for phrase in output:

    words = phrase.split()
    n1 = words[0]   # first word: subject
    v = words[1]    # second word: verb
    n2 = words[2]   # third word: object

    separated_components = {'YEAR':year,'SENTENCES':sentence,'NOUN1':n1 , 'VERB':v , 'NOUN2':n2}
    components.append(separated_components)

    # Count the occurrence of the verb
    verb_dict[v] = verb_dict.get(v, 0) + 1

df_sep = pd.DataFrame(components)
df_sep.head(10)
  

In [None]:
df_sep[df_sep['VERB']=='support'].head()

In [None]:
df_sep['VERB'].value_counts()[0:10]


# RELATION EXTRACTION PART 2:

Sample sentences from the dataset:

1. With support from **many countries**, we have made **impressive progress**

2. Because of their **unique expertise** and regional legitimacy, they can be instruments for solving some of the **hardest challenges** we face

3. We are right to aim high and take on the **mightiest tasks**

In [None]:
from spacy import displacy
displacy.render(nlp("With support from many countries, we have made impressive progress"),style='dep',jupyter=True)
doc = nlp("With support from many countries, we have made impressive progress")


In [None]:
def RULE_2(text):
  """Extract modifier + noun phrases from a sentence.

  A token qualifies when it is tagged NOUN and its dependency is 'dobj',
  'pobj', 'nsubj' or 'nsubjpass'. Its children that are tagged ADJ or have
  a 'compound' dependency are placed, in order, in front of the noun.

  Parameters
  ----------
  text : str
      Sentence to analyse.

  Returns
  -------
  list of str
      One phrase per qualifying noun that has at least one such child. Every
      word in a phrase, including the last, is followed by a single space.
  """
  pattern = []

  for noun in nlp(text):
    if noun.pos_ != 'NOUN' or noun.dep_ not in ['dobj','pobj','nsubj','nsubjpass']:
      continue

    # Each modifier keeps its trailing space, so the prefix is empty exactly
    # when the noun has no adjective or compound child.
    prefix = ''.join(
        child.text + ' '
        for child in noun.children
        if child.pos_ == 'ADJ' or child.dep_ == 'compound'
    )

    if prefix:
      pattern.append(prefix + noun.text + ' ')

  return pattern


In [None]:
df2.columns

In [None]:
# One record per sentence of df2: its year, the sentence, and the phrases RULE_2 extracts from it.
rowlist = [
    {'YEAR': year, 'SENTENCES': sent, 'OUTPUT': RULE_2(sent)}
    for year, sent in zip(df2['YEAR'], df2['SENTENCES'])
]

df4 = pd.DataFrame(rowlist)   # the list of dictionaries becomes the DataFrame df4
df4.head()

In [None]:
#SELECTING NON-EMPTY OUTPUTS:

# DataFrame.append was removed in pandas 2.0, so the rows of df4 with a
# non-empty OUTPUT list are selected with a boolean mask instead of being
# appended one at a time.
has_output = df4['OUTPUT'].map(len) != 0

# reset_index(drop=True) renumbers the kept rows from 0 and discards the old index.
df_show = df4[has_output].reset_index(drop=True)

In [None]:
df_show.shape
df_show.head()

# Combining Rule 1 and Rule 2

Sample sentences:

1. With support from many countries, **we** have **made** **Impressive progress**

2. **It** will **require** **military planners** the world over to recognize that training for peace operations is a legitimate part of every nation security strategy

In [None]:
#MODIFIYING RULE 2
def RULE_2_mod(text,index):
  """Return the modifiers of the token at position `index` in `text`.

  Modifiers are the children of that token that are tagged ADJ or have a
  'compound' dependency.

  Parameters
  ----------
  text : str
      Sentence to analyse.
  index : int
      Position (token.i) of the target token in the parsed sentence.

  Returns
  -------
  str
      The modifier texts joined by single spaces; an empty string when the
      token has none or when `index` matches no token.
  """
  doc = nlp(text)

  # token.i is the token's position in the doc, so the target can be indexed
  # directly. An index that matches no token yields no modifiers.
  if not 0 <= index < len(doc):
    return ""

  modifiers = [
      subtoken.text
      for subtoken in doc[index].children
      if (subtoken.pos_ == 'ADJ') or (subtoken.dep_ == 'compound')
  ]

  # Join with spaces: plain concatenation would glue several modifiers
  # together into one unreadable word.
  return ' '.join(modifiers)

In [None]:
"""This code essentially looks for VERBS in the text and checks if they have subject nouns and direct object nouns associated with them. 
It then extracts any adjectives related to these nouns and forms a pattern using the adjective, noun, and verb lemma.  """

#USING RULE 3 AND RULE 4
def _noun_modifiers(token):
  """Return the texts of `token`'s children that are tagged ADJ or have a 'compound' dependency."""
  return [child.text for child in token.children
          if (child.pos_ == 'ADJ') or (child.dep_ == 'compound')]


def RULE_1_mod(text):
  """Extract 'subject verb object' phrases, each noun preceded by its modifiers.

  For every token tagged VERB, each left child that is a subject
  (dependency 'nsubj' or 'nsubjpass', tagged NOUN, PROPN or PRON) is paired
  with each right child that is a direct object (dependency 'dobj', tagged
  NOUN or PROPN). Adjective and compound children of the subject and of the
  object are placed in front of the respective noun.

  Parameters
  ----------
  text : str
      Sentence to analyse.

  Returns
  -------
  list of str
      One phrase per subject/object pair, words separated by single spaces,
      using the verb's lemma; empty when no verb has both.
  """
  doc = nlp(text)
  sent = []

  for token in doc:
    if token.pos_ != 'VERB':
      continue

    subjects = [child for child in token.lefts
                if (child.dep_ in ['nsubj','nsubjpass']) and (child.pos_ in ['NOUN','PROPN','PRON'])]
    objects = [child for child in token.rights
               if (child.dep_ in ['dobj']) and (child.pos_ in ['NOUN','PROPN'])]

    for subject in subjects:
      for obj in objects:
        # The modifiers are read from the tokens of the sentence that is
        # already parsed, so the text is not run through spaCy again for
        # every noun.
        words = (_noun_modifiers(subject) + [subject.text, token.lemma_]
                 + _noun_modifiers(obj) + [obj.text])

        # Joining the word list gives exactly one space between words: no
        # leading space when the subject has no modifier, and the verb lemma
        # is never glued to the object's adjective.
        sent.append(' '.join(words))

  return sent

In [None]:
# One record per sentence of df2: its year, the sentence, and the phrases RULE_1_mod extracts from it.
rowlist = [
    {'YEAR': year, 'SENTENCES': sent, 'OUTPUT': RULE_1_mod(sent)}
    for year, sent in zip(df2['YEAR'], df2['SENTENCES'])
]

df_rule = pd.DataFrame(rowlist)
df_rule.head()

In [None]:
#SELECTING NON-EMPTY OUTPUTS

# DataFrame.append was removed in pandas 2.0, so the rows of df_rule with a
# non-empty OUTPUT list are selected with a boolean mask instead of being
# appended one at a time.
has_output = df_rule['OUTPUT'].map(len) != 0

# reset_index(drop=True) renumbers the kept rows from 0 and discards the old index.
df_show_mod = df_rule[has_output].reset_index(drop=True)


In [None]:

df_show_mod.shape

In [None]:
df_show_mod.head()

In [None]:
print(df_show_mod.loc[4,'OUTPUT'])
print(df_show_mod.loc[4,'SENTENCES'])

In [None]:
print(df_show_mod.loc[12,'OUTPUT'])
print(df_show_mod.loc[12,'SENTENCES'])

# Rule 5: Patterns using prepositions

Sample sentences:

1. We all believe the **benefits of globalization** must be allocated more broadly within and among societies

2. As the Millennium Summit reflected, we have no **shortage of** worthy **goals**

3. It should preserve the special **responsibility for peacekeeping** of the Security Council permanent members

In [None]:
from spacy import displacy
displacy.render(nlp("We all believe the benefits of globalization must be allocated more broadly within and among societies"),style='dep',jupyter=True)


In [None]:
def RULE_3(text):
  """Extract 'noun preposition noun...' phrases from a sentence.

  For every token tagged ADP whose head is tagged NOUN, the phrase is the
  head's text, the adposition's text, and the text of each right child of
  the adposition tagged NOUN or PROPN, separated by single spaces.

  Parameters
  ----------
  text : str
      Sentence to analyse.

  Returns
  -------
  list of str
      The phrases in token order. A phrase is kept when it is longer than two
      characters, so it may consist of the head and the adposition only.
  """
  sent = []

  for prep in nlp(text):
    if prep.pos_ != 'ADP' or prep.head.pos_ != 'NOUN':
      continue

    pieces = [prep.head.text, prep.text]
    pieces.extend(child.text for child in prep.rights
                  if child.pos_ in ['NOUN','PROPN'])
    phrase = ' '.join(pieces)

    # NOTE(review): head + space + adposition looks always longer than two characters, so this check may never reject a phrase - confirm the intent.
    if len(phrase) > 2:
      sent.append(phrase)

  return sent



In [None]:
# One record per sentence of df2: its year, the sentence, and the phrases RULE_3 extracts from it.
rowlist = [
    {'YEAR': year, 'SENTENCES': sent, 'OUTPUT': RULE_3(sent)}
    for year, sent in zip(df2['YEAR'], df2['SENTENCES'])
]

df_rule = pd.DataFrame(rowlist)
df_rule.head()

In [None]:
#SELECTING NON-EMPTY OUTPUTS:

# DataFrame.append was removed in pandas 2.0, so the rows of df_rule with a
# non-empty OUTPUT list are selected with a boolean mask instead of being
# appended one at a time.
has_output = df_rule['OUTPUT'].map(len) != 0

# reset_index(drop=True) renumbers the kept rows from 0 and discards the old index.
df_show = df_rule[has_output].reset_index(drop=True)


In [None]:
df_show.head(10)

In [None]:
df_show.columns

In [None]:
#SEPARATE NOUN, PREPOSITION AND NOUN
prep_dict={}      # number of extracted phrases per preposition
separated_components ={}   # components of the phrase processed last
components = []

for i in range(len(df_show)):
  # Values of the current row of df_show
  sentence = df_show.loc[i,'SENTENCES']
  year = df_show.loc[i,'YEAR']
  output = df_show.loc[i,'OUTPUT']

  # The loop variable has its own name: reusing the sentence's name would
  # overwrite it, and the SENTENCES column would then store the extracted
  # phrase instead of the sentence it came from.
  for phrase in output:

    words = phrase.split()
    n1 = words[0]    # first word: the noun the preposition attaches to
    p = words[1]     # second word: the preposition
    n2 = words[2:]   # remaining words as a list (empty when no noun follows)

    separated_components = {'YEAR':year,'SENTENCES':sentence,'NOUN-1':n1,'PREPOSITION':p,'NOUN-2':n2}

    components.append(separated_components)

    # Count the occurrence of the preposition
    prep_dict[p] = prep_dict.get(p, 0) + 1

df_sep2 = pd.DataFrame(components)


In [None]:
df_sep2.head(10)

In [None]:
df_sep2['PREPOSITION'].value_counts()[:10]

In [None]:
df_sep2[df_sep2['PREPOSITION']=='against'].head(10)