In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for current_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Training split of the Quora insincere-questions dataset; later cells read its
# `question_text` and `target` columns.
TRAIN_CSV_PATH = "/kaggle/input/quora-insincere-questions-classification/train.csv"
data = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import pandas as pd

# Assumes the dataset is already loaded into a DataFrame named 'data'.

# Rows kept per class in the preview frame. The original comments and the
# variable name both say 200, while the code took only one row per class.
SAMPLES_PER_CLASS = 200

# Separate features and labels
X = data['question_text']
y = data['target']

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform text data into numerical features (a scipy sparse matrix)
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Resample using SMOTE; the result is still sparse
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# Choose the first SAMPLES_PER_CLASS row positions of each class BEFORE
# densifying. Calling .toarray() on the full resampled matrix allocates
# n_rows x vocabulary_size dense floats, which is expected to exhaust memory on
# a corpus of this size; only the selected rows are converted here, so the full
# dense `resampled_data` frame is intentionally no longer built.
labels = np.asarray(y_resampled)
sample_rows = np.concatenate([
    np.flatnonzero(labels == class_label)[:SAMPLES_PER_CLASS]
    for class_label in (0, 1)
])

# Class-0 rows come first, then class-1 rows; the index keeps each row's
# position in the resampled data, as the original filter + head() did.
resampled_data_200_samples = pd.DataFrame(
    X_resampled[sample_rows].toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=sample_rows,
)
# NOTE(review): if the vocabulary itself contains the token "target", this
# assignment overwrites that feature column (the original did the same).
resampled_data_200_samples['target'] = labels[sample_rows]

# Print the resampled data with SAMPLES_PER_CLASS samples for each class
print(resampled_data_200_samples)


In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# First ten rows whose label is 0, printed as plain text.
is_zero_target = data['target'] == 0
data_zero_target = data[is_zero_target].head(10)
print(data_zero_target)


In [None]:
# 1. Loading the language library
import spacy
# Load spaCy's "en_core_web_sm" pipeline; later cells call it as `nlp(text)`.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Preview the first five questions. Only the last expression of a cell is
# displayed, so the earlier `[:100]` expression (never shown) and the
# commented-out alternative were dropped.
data["question_text"].iloc[:5]

In [None]:
# 2. Building a Pipeline Object
# Feed spaCy the raw question strings joined by newlines. Series.to_string()
# produces a *display* rendering (values padded for column alignment and limited
# by pandas' display width settings), so parsing its output analyses padded or
# shortened text rather than the questions themselves.
doc = nlp("\n".join(data["question_text"].iloc[:3].astype(str)))



In [None]:
from tabulate import tabulate

# One table row per token of `doc`: surface text, coarse POS, dependency label, lemma.
headers = ["Token", "POS", "Dependency", "Lemma"]
token_info = [[tok.text, tok.pos_, tok.dep_, tok.lemma_] for tok in doc]

print(tabulate(token_info, headers=headers, tablefmt="pretty"))


In [None]:
# Entity text and label, each left-aligned and dash-padded to 20 columns,
# followed by spaCy's description of the label.
for ent in doc.ents:
    description = str(spacy.explain(ent.label_))
    print(f"{ent.text:-<20}{ent.label_:-<20}{description}")

In [None]:
# Print the text of every noun chunk spaCy found in `doc`, one per line.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)

In [None]:
import spacy

# Parse the first 50 questions as a single document. The raw strings are joined
# directly: Series.to_string() yields a display rendering (alignment padding,
# width-limited values), which is not the text we want spaCy to analyse.
doc = nlp("\n".join(data["question_text"].iloc[:50].astype(str)))

ROW_FORMAT = "{:<30} {:<15} {:<50}"
print(ROW_FORMAT.format("Entity", "Label", "Description"))

for entity in doc.ents:
    # spacy.explain() returns None for a label it has no description for, and
    # None rejects the "<50" format spec; coerce to str as the earlier
    # dash-padded entity cell already does.
    print(ROW_FORMAT.format(entity.text, entity.label_, str(spacy.explain(entity.label_))))

In [None]:
from spacy import displacy
# Draw the dependency parse of `doc` inline; the 'distance' layout option is set to 90.
displacy.render(doc, style='dep', jupyter=True, options={'distance':90})

In [None]:
# Draw `doc` inline using displaCy's entity style.
displacy.render(doc, style='ent', jupyter=True)

In [None]:
from tabulate import tabulate

# One row per token: text, coarse POS, the integer lemma ID and the lemma text.
# The "Lemma ID" column uses `token.lemma` (no trailing underscore); the
# original listed `token.lemma_` twice, so the ID column repeated the lemma text.
token_info = [
    [token.text, token.pos_, token.lemma, token.lemma_]
    for token in doc
]

# Define headers for the table
headers = ["Token", "POS", "Lemma ID", "Lemma"]

# Print the table using tabulate with borders
print(tabulate(token_info, headers=headers, tablefmt="pretty"))


In [None]:
# Similarity of the questions stored under index labels 1 and 2, printed in
# both directions.
question_col = data["question_text"]
doc_1, doc_2 = nlp(question_col[1]), nlp(question_col[2])

for left, right in ((doc_1, doc_2), (doc_2, doc_1)):
    print(left.similarity(right))

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def calculate_similarity_matrix(questions, pipeline=None):
    """Return the pairwise similarity matrix for a collection of texts.

    Args:
        questions: Iterable of strings (list, tuple, pandas Series, generator...).
            Items are taken in iteration order, so a Series with any index works.
        pipeline: Callable that turns a string into an object exposing
            ``similarity(other)``. Defaults to the notebook-level ``nlp``.

    Returns:
        ``numpy.ndarray`` of shape ``(n, n)`` whose entry ``[i, j]`` is
        ``parsed[i].similarity(parsed[j])``.
    """
    if pipeline is None:
        pipeline = nlp

    # Parse every text exactly once. The original called the pipeline on both
    # texts inside the double loop, i.e. 2 * n^2 parses for n texts.
    parsed = [pipeline(question) for question in questions]

    num_questions = len(parsed)
    similarity_matrix = np.zeros((num_questions, num_questions))
    for i, doc_i in enumerate(parsed):
        for j, doc_j in enumerate(parsed):
            similarity_matrix[i, j] = doc_i.similarity(doc_j)
    return similarity_matrix

# Annotated heat-map of the pairwise similarities of the first ten questions.
questions = data["question_text"][:10]
similarity_matrix = calculate_similarity_matrix(questions)

tick_labels = range(1, 11)  # axis ticks are numbered 1..10

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.2)
sns.heatmap(
    similarity_matrix,
    annot=True,
    cmap="YlGnBu",
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
plt.xlabel("Question Index")
plt.ylabel("Question Index")
plt.title("Similarity Matrix between Questions")
plt.show()


In [None]:
# Print the questions stored under index labels 3 and 7.
# NOTE(review): the heat-map ticks are numbered from 1 while these labels start
# at 0 - confirm which pair was meant.
for question_label in (3, 7):
    print(data["question_text"][question_label])

In [None]:

import spacy
from spacy import displacy

# (Re)load the small English pipeline, then draw each of the first five
# questions twice: its dependency parse, followed by its named entities.
nlp = spacy.load("en_core_web_sm")

first_five_questions = data["question_text"].iloc[:5]
for text in first_five_questions:
    doc = nlp(text)
    for render_style in ("dep", "ent"):
        displacy.render(doc, style=render_style, jupyter=True)



**Working with POS Tags**

In [None]:
# For each of the first five questions, print a table of its tokens with the
# coarse POS, the fine-grained tag and spaCy's explanation of that tag, then a
# 100-dash separator line.
for text in data["question_text"].iloc[:5]:
    doc = nlp(text)

    headers = ["Token", "POS", "Tag", "Explanation"]
    token_info = [
        [tok.text, tok.pos_, tok.tag_, spacy.explain(tok.tag_)]
        for tok in doc
    ]

    print(tabulate(token_info, headers=headers))
    print("-" * 100)

In [None]:
import matplotlib.pyplot as plt
# One bar chart of POS-tag counts per question, for the first five questions.
for text in data["question_text"].iloc[:5]:
    doc = nlp(text)

    # count_by() is keyed by integer attribute IDs; the vocab lookup turns each
    # ID back into its tag name for the axis labels.
    pos_count = doc.count_by(spacy.attrs.POS)
    pos_count_text = {}
    for pos_id, n_tokens in pos_count.items():
        pos_count_text[doc.vocab[pos_id].text] = n_tokens

    plt.figure(figsize=(8, 6))
    plt.bar(pos_count_text.keys(), pos_count_text.values(), color='skyblue')
    plt.title('POS Tag Counts')
    plt.xlabel('POS Tag')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# POS-tag counts summed over the first five questions, shown as one bar chart.
pos_count_total = {}

for text in data["question_text"].iloc[:5]:
    doc = nlp(text)
    pos_count = doc.count_by(spacy.attrs.POS)

    # Fold this question's counts into the running totals, keyed by tag name.
    for key, value in pos_count.items():
        pos_tag = doc.vocab[key].text
        if pos_tag in pos_count_total:
            pos_count_total[pos_tag] += value
        else:
            pos_count_total[pos_tag] = value

plt.figure(figsize=(8, 6))
plt.bar(pos_count_total.keys(), pos_count_total.values(), color='green')
plt.title('Combined POS Tag Counts')
plt.xlabel('POS Tag')
plt.ylabel('Count')
plt.xticks(rotation=45)  # slanted labels stay readable
plt.tight_layout()
plt.show()

In [None]:
# Column-wise views of the tokens in `doc`.
# NOTE(review): `doc` is not assigned in this cell; it is whatever an earlier
# cell last bound to that name.
pos = [tok.pos_ for tok in doc]
lemma = [tok.lemma_ for tok in doc]
text = [tok.text for tok in doc]

In [None]:
# Assemble the per-token lists into one table and display it.
token_columns = {"Text": text, "Lemma": lemma, "PoS": pos}
nlp_table = pd.DataFrame(token_columns)
nlp_table

In [None]:
# Lower-cased lemmas of every NOUN token in the first 10,000 questions.
# nlp.pipe() streams the texts through the pipeline in batches, which spaCy
# documents as the efficient way to process many texts, instead of issuing
# 10,000 separate nlp(...) calls.
noun = []

for doc in nlp.pipe(data["question_text"].iloc[:10000]):
    for tok in doc:
        if tok.pos_ == 'NOUN':
            noun.append(tok.lemma_.lower())

# Most frequent noun lemmas.
pd.Series(noun).value_counts().head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Distribution of coarse POS tags over the first 10,000 questions.
# Requires `data` (with a "question_text" column) and the spaCy pipeline `nlp`.

# POS tag -> number of tokens carrying it
pos_counts = {}

# nlp.pipe() processes the texts in batches, which spaCy documents as the
# efficient way to handle many texts, rather than one nlp(...) call per question.
for doc in nlp.pipe(data["question_text"].iloc[:10000]):
    for token in doc:
        pos_tag = token.pos_
        pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1

# Convert dictionary to pandas Series for easier manipulation
pos_counts_series = pd.Series(pos_counts)

# Sort the Series by counts in descending order
pos_counts_series_sorted = pos_counts_series.sort_values(ascending=False)

# Plot the distribution of POS tags
plt.figure(figsize=(10, 6))
pos_counts_series_sorted.plot(kind='bar')
plt.title('Distribution of POS Tags in Questions')
plt.xlabel('POS Tag')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()


In [None]:
# We now know that the questions frequently mention "people", "way", "year", etc., but we still don't know in what context these keywords are mentioned.

In [None]:
#Extract all the prefixes and suffixes of "people"

In [None]:
import re

# Collect every "<word> people <word>" triple from the first 1000 questions,
# which are first concatenated into one space-separated string.
first_questions = data["question_text"].iloc[:1000].astype(str)
text = " ".join(first_questions)

pattern = re.compile(r"\b\w+\speople\s\w+\b")
prefixes_suffixes = pattern.findall(text)




In [None]:
from tabulate import tabulate

# Number the matches from 1 and print them as a two-column table.
occurrences_data = [
    [position, matched_text]
    for position, matched_text in enumerate(prefixes_suffixes, start=1)
]

print(tabulate(occurrences_data, headers=["Index", "Occurrence"]))



In [None]:
# Lower-cased word before ("prefix") and after ("suffix") the keyword in every match.
prefixes = []
suffixes = []
for matched_text in prefixes_suffixes:
    # The pattern separates the three words with \s (any whitespace character),
    # so split on any whitespace. Splitting on " " alone leaves a tab- or
    # newline-separated match in one piece and yields a wrong prefix/suffix.
    words = matched_text.split()
    prefixes.append(words[0].lower())
    suffixes.append(words[-1].lower())

In [None]:
# Five most frequent words found directly before the keyword.
pd.Series(prefixes).value_counts().head(5)

In [None]:
# Five most frequent words found directly after the keyword.
pd.Series(suffixes).value_counts().head(5)

In [None]:
# Keep only the five most frequent values. Both names are rebound here from
# the full lists to pandas Index objects holding the top-five words.
prefix_counts = pd.Series(prefixes).value_counts()
suffix_counts = pd.Series(suffixes).value_counts()
prefixes = prefix_counts.index[:5]
suffixes = suffix_counts.index[:5]

In [None]:
# Top prefixes and suffixes side by side, with the keyword repeated in between.
context_columns = {
    'prefixes': prefixes,
    'keyword': ['people'] * len(prefixes),
    'suffixes': suffixes,
}
pd.DataFrame(context_columns)

In [None]:
def get_context(reviews, keyword, excluded_words=None, ignore_case=False):
    """Tabulate the words that most often surround ``keyword`` in a text.

    Every ``<word> keyword <word>`` occurrence is located; the word before
    (prefix) and after (suffix) is lower-cased, excluded words are dropped, and
    the five most frequent prefixes and suffixes are returned side by side.
    Prefixes and suffixes are ranked independently of each other.

    Args:
        reviews: The text to search, as a single string.
        keyword: Literal keyword to look for (it is regex-escaped).
        excluded_words: Collection of lower-case words to drop. Defaults to the
            notebook-level ``stop_words``, which must then be defined before
            this function is *called*.
        ignore_case: Match the keyword case-insensitively when true.

    Returns:
        DataFrame with columns ``prefixes``, ``keyword`` and ``suffixes``. When
        one side has fewer distinct words than the other, it is padded with NaN.
    """
    if excluded_words is None:
        excluded_words = stop_words

    # Raw f-string: "\w" and "\s" are invalid escapes in a plain string literal.
    # re.escape keeps keywords such as "c++" from being read as regex syntax.
    flags = re.IGNORECASE if ignore_case else 0
    pattern = re.compile(rf"\w+\s{re.escape(keyword)}\s\w+", flags)

    prefixes = []
    suffixes = []
    for matched_text in pattern.findall(reviews):
        # The pattern allows any whitespace between words, so split on any
        # whitespace as well (split(" ") breaks on tabs and newlines).
        words = matched_text.split()
        prefixes.append(words[0].lower())
        suffixes.append(words[-1].lower())

    prefixes = [word for word in prefixes if word not in excluded_words]
    suffixes = [word for word in suffixes if word not in excluded_words]

    top_prefixes = list(pd.Series(prefixes, dtype=object).value_counts().head(5).index)
    top_suffixes = list(pd.Series(suffixes, dtype=object).value_counts().head(5).index)

    # Filtering can leave the two sides with different lengths; building the
    # frame from Series pads the shorter column instead of raising ValueError.
    n_rows = max(len(top_prefixes), len(top_suffixes))
    return pd.DataFrame({
        'prefixes': pd.Series(top_prefixes, dtype=object),
        'keyword': pd.Series([f'{keyword}'] * n_rows, dtype=object),
        'suffixes': pd.Series(top_suffixes, dtype=object),
    })

In [None]:
# get_context(data["question_text"].iloc[:1,:1000].astype(str),"india")

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch NLTK's stop-word corpus, then keep the English list as a set named
# `stop_words` (the name get_context() looks up).
nltk.download('stopwords')

english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)

In [None]:
# How does dependency parsing work?

In [None]:

# Four sentences in the active voice and, in the same order, their passive
# counterparts.
active = [
    'Hens lay eggs.',
    'Birds build nests.',
    'The batter hit the ball.',
    'The computer transmitted a copy of the manual',
]
passive = [
    'Eggs are laid by hens',
    'Nests are built by birds',
    'The ball was hit by the batter',
    'A copy of the manual was transmitted by the computer.',
]

In [None]:
# Dependency diagram for each active-voice sentence.
for active_sentence in active:
    doc = nlp(active_sentence)
    displacy.render(doc, style='dep')

In [None]:
# Dependency diagram for each passive-voice sentence.
for passive_sentence in passive:
    doc = nlp(passive_sentence)
    displacy.render(doc, style="dep")

In [None]:


# Summary:

#     spaCy's dependency parser lets us visualise the relationships between words
#     When a sentence is in the passive voice, an nsubjpass dependency relation is always present

# #

In [None]:
# NER (Named Entity Recognition)

In [None]:
import spacy
import warnings
# Install a blanket "ignore" filter for warnings from this point on.
warnings.filterwarnings('ignore')

In [None]:


# NOTE(review): the same "en_core_web_sm" package is already loaded into `nlp`
# by earlier cells; `model` is a second handle used by the NER cells below.
model = spacy.load("en_core_web_sm") #load pre-trained model



# processed_doc = model(data); #process input and perform NLP tasks



In [None]:
# First 1000 question strings. NOTE: despite its name, `doc` is a pandas Series
# here; the following cell rebinds `doc` to a parsed spaCy document.
doc = data["question_text"].iloc[:1000]

In [None]:
# Parse the first 1000 questions with `model` and keep every parsed document.
# The texts are read from `data` under their own name: the original loop
# iterated over `doc` while rebinding `doc` to each parsed result, so running
# the cell a second time iterated over the tokens of the last parsed question
# instead of over the questions.
question_texts = data["question_text"].iloc[:1000]
processed_docs = [model(text) for text in question_texts]

# Later cells inspect `doc`; leave it bound to the last parsed question, which
# is what the original loop ended with.
if processed_docs:
    doc = processed_docs[-1]

In [None]:
# Named entities of whatever document `doc` currently refers to.
doc.ents

In [None]:
# Print each token of `doc` next to its coarse part-of-speech tag.
for tok in doc:
    print(f"{tok.text} -- {tok.pos_}")

In [None]:

# Toy expression grammar: each non-terminal maps to its list of alternative
# productions, and each production is a list of symbols. `S` is left-recursive.
grammar = {
    'S': [
        ['S', '+', 'E'],
        ['E'],
    ],
    'E': [
        ['num'],
        ['(', 'S', ')'],
    ],
}


def remove_left_recursion(grammar):
    """Eliminate immediate left recursion from a context-free grammar.

    For a non-terminal ``A`` with productions ``A -> A a1 | A a2 | b1 | b2``
    the result contains ``A -> b1 A' | b2 A'`` and
    ``A' -> ε | a1 A' | a2 A'``. Only *immediate* left recursion is handled;
    indirect recursion (``A -> B x``, ``B -> A y``) is left untouched.

    Args:
        grammar: Dict mapping each non-terminal to a list of productions, each
            production being a list of symbols. An empty production may be
            written as ``[]`` or ``['ε']``.

    Returns:
        A new dict in the same format. The input is not modified and the result
        shares no list objects with it.
    """
    EPSILON = 'ε'
    new_grammar = {}

    for A in list(grammar.keys()):
        recursive_tails = []    # the "alpha" parts of A -> A alpha
        other_productions = []  # the alternatives that do not start with A

        for production in grammar[A]:
            # `production and ...` guards the empty production, which has no
            # first symbol to inspect.
            if production and production[0] == A:
                tail = list(production[1:])
                # A -> A derives nothing new; keeping it would only create the
                # useless rule A' -> A'.
                if tail:
                    recursive_tails.append(tail)
            else:
                # Copy so the caller's lists are never aliased by the result.
                other_productions.append(list(production))

        if not recursive_tails:
            new_grammar[A] = other_productions
            continue

        # Fresh helper non-terminal; add primes until the name is unused.
        A_prime = A + "'"
        while A_prime in grammar or A_prime in new_grammar:
            A_prime += "'"

        # An epsilon alternative followed by A' is just A'.
        new_grammar[A] = [
            ([] if beta == [EPSILON] else beta) + [A_prime]
            for beta in other_productions
        ]
        new_grammar[A_prime] = [[EPSILON]] + [
            tail + [A_prime] for tail in recursive_tails
        ]

    return new_grammar

# Transform the toy grammar, then print it before and after, one rule per line
# in the form "LHS -> alt1 | alt2".
new_grammar = remove_left_recursion(grammar)

for title, rules in (
    ("Grammar before left recursion removal:", grammar),
    ("Grammar after left recursion removal:", new_grammar),
):
    print(title)
    for lhs, alternatives in rules.items():
        rhs = ' | '.join(' '.join(symbols) for symbols in alternatives)
        print(f"{lhs} -> {rhs}")


In [None]:
import nltk
from nltk import PCFG
from nltk.parse import pchart
from nltk.tokenize import word_tokenize

# Small probabilistic grammar in which a prepositional phrase can attach to
# either a noun phrase or a verb phrase.
pcfg_grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.7] | NP PP [0.3]
    PP -> P NP [1.0]
    VP -> V NP [0.6] | VP PP [0.4]
    Det -> 'the' [0.8] | 'a' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    V -> 'saw' [1.0]
    P -> 'with' [1.0]
""")

sentence = "the man saw a telescope with the telescope"
tokens = word_tokenize(sentence)

# Print every parse tree the chart parser yields for the token sequence.
parser = pchart.InsideChartParser(pcfg_grammar)
for tree in parser.parse(tokens):
    print(tree)


In [None]:

# nltk.download() with no arguments starts NLTK's interactive downloader, which
# waits for user input and therefore stalls an unattended "Run All". Request
# the needed resources by name instead.
# NOTE(review): the names are inferred from what the cells below use (WordNet,
# word_tokenize, pos_tag); newer NLTK releases may expect 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - confirm against the installed version.
for resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.corpus import wordnet

In [None]:
# Shell escape: extract the bundled wordnet.zip into the corpora directory it sits in.
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [None]:
import nltk
from nltk.corpus import wordnet

# Function to identify ambiguous words in a sentence
def identify_ambiguous_words(sentence):
    """Return the tokens of `sentence` that WordNet lists under more than one synset.

    Tokens are returned in order of appearance; a token that occurs several
    times is reported each time.
    """
    return [
        token
        for token in nltk.word_tokenize(sentence)
        if len(wordnet.synsets(token)) > 1
    ]

# Function to pick one sense for a word
def disambiguate_word(word):
    """Return the definition of the first WordNet synset of `word`.

    No sentence context is consulted: the first synset is simply taken.
    Returns None when WordNet has no synset for the word.
    """
    candidate_senses = wordnet.synsets(word)
    if not candidate_senses:
        return None
    return candidate_senses[0].definition()

import re

# Example sentence
sentence = "I saw a bat flying in the sky."

# Identify ambiguous words
ambiguous_words = identify_ambiguous_words(sentence)
print("Ambiguous words:")
for word in ambiguous_words:
    print(word)

# Look up the replacement definition of each ambiguous word (words without a
# definition are left alone).
definitions = {}
for word in ambiguous_words:
    disambiguated_word = disambiguate_word(word)
    if disambiguated_word:
        definitions[word] = disambiguated_word

# Substitute WHOLE words only, in one pass over the original sentence.
# Chained str.replace() calls replace substrings (a token such as "a" would hit
# every letter "a") and also rewrite text inside definitions inserted by earlier
# replacements, which garbles the result.
disambiguated_sentence = sentence
if definitions:
    alternatives = "|".join(
        re.escape(candidate)
        for candidate in sorted(definitions, key=len, reverse=True)
    )
    whole_word = re.compile(rf"(?<!\w)(?:{alternatives})(?!\w)")
    # A function replacement keeps backslashes in definitions literal.
    disambiguated_sentence = whole_word.sub(
        lambda found: definitions[found.group(0)], sentence
    )

print("\nDisambiguated sentence:")
print(disambiguated_sentence)


In [None]:
import nltk

# Noun-phrase chunking of one example sentence: tokenize, POS-tag, then group
# runs of determiner/adjective/noun tags into NP chunks with a regexp grammar.
sentence = "The quick brown fox jumps over the lazy dog."

tokens = nltk.word_tokenize(sentence)
tagged_tokens = nltk.pos_tag(tokens)

chunk_grammar = r"""
    NP: {<DT|JJ|NN.*>+}          # Chunk sequences of determiner, adjective, noun
"""
chunk_parser = nltk.RegexpParser(chunk_grammar)
chunks = chunk_parser.parse(tagged_tokens)

# Print the words of every NP subtree; plain (word, tag) leaves are skipped.
for chunk in chunks:
    if not isinstance(chunk, nltk.tree.Tree):
        continue
    if chunk.label() != 'NP':
        continue
    print(" ".join(leaf[0] for leaf in chunk.leaves()))


In [None]:
import nltk

def shallow_parse(sentence):
    """Return the noun phrases of `sentence` as a list of strings.

    The sentence is tokenized and POS-tagged with NLTK, then chunked with a
    regexp grammar that groups runs of determiner, adjective and noun tags.
    Each returned string is the space-joined words of one NP chunk.
    """
    chunk_grammar = r"""
        NP: {<DT|JJ|NN.*>+}         
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    chunks = nltk.RegexpParser(chunk_grammar).parse(tagged_tokens)

    return [
        " ".join(leaf[0] for leaf in chunk.leaves())
        for chunk in chunks
        if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'NP'
    ]

# Sample questions to run through the shallow parser.
questions = [
    "How did Quebec nationalists see their province as a nation in 1960s?",
    "What province adopted the National Flag of Canada in 1965?",
    "Which province had a referendum to separate from Canada in 1980?",
    "What was the first province to join Confederation?",
    "Which province is the most populous?",
    "What is Canada's second most populous province?",
    "Which province is considered the cultural heart of English Canada?",
    "What province is known for its beautiful coastlines?"
]

for i, question in enumerate(questions, 1):
    print(f"Question {i}: {question}")
    noun_phrases = shallow_parse(question)
    print("Noun Phrases:")
    # The loop variable must not be called `np`: at notebook level that rebinds
    # the numpy alias to a string and breaks every cell that uses numpy when it
    # is (re-)run afterwards.
    for noun_phrase in noun_phrases:
        print("-", noun_phrase)
    print()
