In [60]:
import nltk
import string
import spacy
from sacremoses import MosesTokenizer, MosesDetokenizer
from spacy import displacy
from collections import Counter
import truecase
import en_core_web_sm
nlp = en_core_web_sm.load()
from pprint import pprint
from unidecode import unidecode
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import warnings
from tqdm.notebook import tqdm_notebook
from scipy.spatial import distance
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
import tensorflow as tf
import transformers
from sent2vec.vectorizer import Vectorizer as S2vectorizer
tqdm_notebook.pandas()
warnings.filterwarnings("ignore")

pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_columns', 0)

## NER Generation

In [41]:
pprint([(X.text, X.label_) for X in nlp('The political system of the Islamic Republic is based on the 1979 Constitution, and comprises several intricately connected governing bodies. The Leader of the Revolution ("Supreme Leader") is responsible for delineation and supervision of the general policies of the Islamic Republic of Iran. The Supreme Leader is Commander-in-Chief of the armed forces, controls the military intelligence and security operations, and has sole power to declare war or peace. The heads of the judiciary, state radio and television networks, the commanders of the police and military forces and six of the twelve members of the Guardian Council are appointed by the Supreme Leader. The Assembly of Experts elects and dismisses the Supreme Leader on the basis of qualifications and popular esteem.').ents])

[('the Islamic Republic', 'GPE'),
 ('1979', 'DATE'),
 ('Constitution', 'LAW'),
 ('the Islamic Republic of Iran', 'GPE'),
 ('six', 'CARDINAL'),
 ('twelve', 'CARDINAL'),
 ('the Guardian Council', 'ORG'),
 ('The Assembly of Experts', 'ORG')]


In [55]:
pprint([(X.text, X.label_) for X in nlp('german').ents])

[]


In [56]:
pprint([(X.text, X.label_) for X in nlp('German').ents])

[('German', 'NORP')]


In [13]:
def generate_ners(text):
    result = [unidecode(X.text.lower()) for X in nlp(text).ents]
    return result

In [None]:
df = pd.read_csv('../data/SQuAD_csv.csv', encoding='utf-8').loc[:, ['context']]
df = df.drop_duplicates(subset=['context'])
df['context_ner'] = df['context'].progress_apply(generate_ners)
# df['context'] = df['context'].apply(lambda x: str(x).lower())

In [None]:
df.head()

## Text Preprocessing

In [26]:
stop_words = stopwords.words("english")

def clean_normalcase_stop(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\s{2,}', " ", text)
    text = unidecode(text)
    return text

def clean_normalcase_nostop(text):
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\s{2,}', " ", text)
    text = unidecode(text)
    return text

def clean_lowercase_stop(text):
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\s{2,}', " ", text)
    text = unidecode(text)
    return text.lower()

def clean_lowercase_nostop(text):
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\s{2,}', " ", text)
    text = unidecode(text)
    return text.lower()

In [34]:
df['cleaned_normalcase_stop'] = df['context'].progress_apply(clean_normalcase_stop)

HBox(children=(FloatProgress(value=0.0, max=18877.0), HTML(value='')))




In [35]:
df['cleaned_normalcase_nostop'] = df['context'].progress_apply(clean_normalcase_nostop)

HBox(children=(FloatProgress(value=0.0, max=18877.0), HTML(value='')))




In [36]:
df['cleaned_lowercase_stop'] = df['context'].progress_apply(clean_lowercase_stop)

HBox(children=(FloatProgress(value=0.0, max=18877.0), HTML(value='')))




In [37]:
df['cleaned_lowercase_nostop'] = df['context'].progress_apply(clean_lowercase_nostop)

HBox(children=(FloatProgress(value=0.0, max=18877.0), HTML(value='')))




In [40]:
df['ner_combined'] = df['context_ner'].apply(lambda x: ' '.join(x).strip('[').strip(']').replace(',',''))

In [41]:
df.head(10)

Unnamed: 0,context,context_ner,cleaned_normalcase_stop,cleaned_normalcase_nostop,cleaned_lowercase_stop,cleaned_lowercase_nostop,ner_combined
0,"Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles ""Crazy in Love"" and ""Baby Boy"".","[beyonce giselle knowles-carter, september 4, 1981, american, houston, texas, the late 1990s, r&b, destiny, mathew knowles, beyonce, 2003, five, 100, one, crazy in love, baby boy]",Beyonce Giselle Knowles Carter bi:'janseI bee YON say born September 4 1981 is an American singer songwriter record producer and actress Born and raised in Houston Texas she performed in various singing and dancing competitions as a child and rose to fame in the late 1990s as lead singer of R B girl group Destiny s Child Managed by her father Mathew Knowles the group became one of the world s best selling girl groups of all time Their hiatus saw the release of Beyonce s debut album Dangerously in Love 2003 which established her as a solo artist worldwide earned five Grammy Awards and featured the Billboard Hot 100 number one singles Crazy in Love and Baby Boy,Beyonce Giselle Knowles Carter bi:'janseI bee YON say born September 4 1981 American singer songwriter record producer actress Born raised Houston Texas performed various singing dancing competitions child rose fame late 1990s lead singer R B girl group Destiny s Child Managed father Mathew Knowles group became one world s best selling girl groups time Their hiatus saw release Beyonce s debut album Dangerously Love 2003 established solo artist worldwide earned five Grammy Awards featured Billboard Hot 100 number one singles Crazy Love Baby Boy,beyonce giselle knowles carter bi:'jansei bee yon say born september 4 1981 is an american singer songwriter record producer and actress born and raised in houston texas she performed in various singing and dancing competitions as a child and rose to fame in the late 1990s as lead singer of r b girl group destiny s child managed by her father mathew knowles the group became one of the world s best selling girl groups of all time their hiatus saw the release of beyonce s debut album dangerously in love 2003 which established her as a solo artist worldwide earned five grammy awards and featured the billboard hot 100 number one singles crazy in love and baby boy,beyonce giselle knowles carter bi:'jansei bee yon say born september 4 1981 american singer songwriter record producer actress born raised houston texas performed various singing dancing competitions child rose fame late 1990s lead singer r b girl group destiny s child managed father mathew knowles group became one world s best selling girl groups time their hiatus saw release beyonce s debut album dangerously love 2003 established solo artist worldwide earned five grammy awards featured billboard hot 100 number one singles crazy love baby boy,beyonce giselle knowles-carter september 4 1981 american houston texas the late 1990s r&b destiny mathew knowles beyonce 2003 five 100 one crazy in love baby boy
15,"Following the disbandment of Destiny's Child in June 2005, she released her second solo album, B'Day (2006), which contained hits ""Déjà Vu"", ""Irreplaceable"", and ""Beautiful Liar"". Beyoncé also ventured into acting, with a Golden Globe-nominated performance in Dreamgirls (2006), and starring roles in The Pink Panther (2006) and Obsessed (2009). Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records (2008) influenced her third album, I Am... Sasha Fierce (2008), which saw the birth of her alter-ego Sasha Fierce and earned a record-setting six Grammy Awards in 2010, including Song of the Year for ""Single Ladies (Put a Ring on It)"". Beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. Her critically acclaimed fifth studio album, Beyoncé (2013), was distinguished from previous releases by its experimental production and exploration of darker themes.","[june 2005, second, b'day, 2006, ""deja vu"", irreplaceable, beautiful liar, beyonce, golden globe, dreamgirls, 2006, the pink panther, obsessed, 2009, jay z, etta james, cadillac records, 2008, third, sasha fierce, 2008, sasha fierce, six, grammy awards, 2010, song, the year, single ladies, beyonce, 2010, fourth, 2011, 1970s, 1980s, 1990s, fifth, beyonce, 2013]",Following the disbandment of Destiny s Child in June 2005 she released her second solo album B Day 2006 which contained hits Deja Vu Irreplaceable and Beautiful Liar Beyonce also ventured into acting with a Golden Globe nominated performance in Dreamgirls 2006 and starring roles in The Pink Panther 2006 and Obsessed 2009 Her marriage to rapper Jay Z and portrayal of Etta James in Cadillac Records 2008 influenced her third album I Am Sasha Fierce 2008 which saw the birth of her alter ego Sasha Fierce and earned a record setting six Grammy Awards in 2010 including Song of the Year for Single Ladies Put a Ring on It Beyonce took a hiatus from music in 2010 and took over management of her career her fourth album 4 2011 was subsequently mellower in tone exploring 1970s funk 1980s pop and 1990s soul Her critically acclaimed fifth studio album Beyonce 2013 was distinguished from previous releases by its experimental production and exploration of darker themes,Following disbandment Destiny s Child June 2005 released second solo album B Day 2006 contained hits Deja Vu Irreplaceable Beautiful Liar Beyonce also ventured acting Golden Globe nominated performance Dreamgirls 2006 starring roles The Pink Panther 2006 Obsessed 2009 Her marriage rapper Jay Z portrayal Etta James Cadillac Records 2008 influenced third album I Am Sasha Fierce 2008 saw birth alter ego Sasha Fierce earned record setting six Grammy Awards 2010 including Song Year Single Ladies Put Ring It Beyonce took hiatus music 2010 took management career fourth album 4 2011 subsequently mellower tone exploring 1970s funk 1980s pop 1990s soul Her critically acclaimed fifth studio album Beyonce 2013 distinguished previous releases experimental production exploration darker themes,following the disbandment of destiny s child in june 2005 she released her second solo album b day 2006 which contained hits deja vu irreplaceable and beautiful liar beyonce also ventured into acting with a golden globe nominated performance in dreamgirls 2006 and starring roles in the pink panther 2006 and obsessed 2009 her marriage to rapper jay z and portrayal of etta james in cadillac records 2008 influenced her third album i am sasha fierce 2008 which saw the birth of her alter ego sasha fierce and earned a record setting six grammy awards in 2010 including song of the year for single ladies put a ring on it beyonce took a hiatus from music in 2010 and took over management of her career her fourth album 4 2011 was subsequently mellower in tone exploring 1970s funk 1980s pop and 1990s soul her critically acclaimed fifth studio album beyonce 2013 was distinguished from previous releases by its experimental production and exploration of darker themes,following disbandment destiny s child june 2005 released second solo album b day 2006 contained hits deja vu irreplaceable beautiful liar beyonce also ventured acting golden globe nominated performance dreamgirls 2006 starring roles the pink panther 2006 obsessed 2009 her marriage rapper jay z portrayal etta james cadillac records 2008 influenced third album i am sasha fierce 2008 saw birth alter ego sasha fierce earned record setting six grammy awards 2010 including song year single ladies put ring it beyonce took hiatus music 2010 took management career fourth album 4 2011 subsequently mellower tone exploring 1970s funk 1980s pop 1990s soul her critically acclaimed fifth studio album beyonce 2013 distinguished previous releases experimental production exploration darker themes,"june 2005 second b'day 2006 ""deja vu"" irreplaceable beautiful liar beyonce golden globe dreamgirls 2006 the pink panther obsessed 2009 jay z etta james cadillac records 2008 third sasha fierce 2008 sasha fierce six grammy awards 2010 song the year single ladies beyonce 2010 fourth 2011 1970s 1980s 1990s fifth beyonce 2013"
27,"A self-described ""modern-day feminist"", Beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. On stage, her dynamic, highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music. Throughout a career spanning 19 years, she has sold over 118 million records as a solo artist, and a further 60 million with Destiny's Child, making her one of the best-selling music artists of all time. She has won 20 Grammy Awards and is the most nominated woman in the award's history. The Recording Industry Association of America recognized her as the Top Certified Artist in America during the 2000s decade. In 2009, Billboard named her the Top Radio Songs Artist of the Decade, the Top Female Artist of the 2000s and their Artist of the Millennium in 2011. Time listed her among the 100 most influential people in the world in 2013 and 2014. Forbes magazine also listed her as the most powerful female musician of 2015.","[modern-day, beyonce, 19 years, over 118 million, 60 million, destiny, 20, grammy awards, the recording industry association of america, the top certified artist, america, the 2000s decade, 2009, billboard, the top female artist, the 2000s, their artist of the millennium, 2011, time, 100, 2013, 2014, forbes, 2015]",A self described modern day feminist Beyonce creates songs that are often characterized by themes of love relationships and monogamy as well as female sexuality and empowerment On stage her dynamic highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music Throughout a career spanning 19 years she has sold over 118 million records as a solo artist and a further 60 million with Destiny s Child making her one of the best selling music artists of all time She has won 20 Grammy Awards and is the most nominated woman in the award s history The Recording Industry Association of America recognized her as the Top Certified Artist in America during the 2000s decade In 2009 Billboard named her the Top Radio Songs Artist of the Decade the Top Female Artist of the 2000s and their Artist of the Millennium in 2011 Time listed her among the 100 most influential people in the world in 2013 and 2014 Forbes magazine also listed her as the most powerful female musician of 2015,A self described modern day feminist Beyonce creates songs often characterized themes love relationships monogamy well female sexuality empowerment On stage dynamic highly choreographed performances led critics hailing one best entertainers contemporary popular music Throughout career spanning 19 years sold 118 million records solo artist 60 million Destiny s Child making one best selling music artists time She 20 Grammy Awards nominated woman award s history The Recording Industry Association America recognized Top Certified Artist America 2000s decade In 2009 Billboard named Top Radio Songs Artist Decade Top Female Artist 2000s Artist Millennium 2011 Time listed among 100 influential people world 2013 2014 Forbes magazine also listed powerful female musician 2015,a self described modern day feminist beyonce creates songs that are often characterized by themes of love relationships and monogamy as well as female sexuality and empowerment on stage her dynamic highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music throughout a career spanning 19 years she has sold over 118 million records as a solo artist and a further 60 million with destiny s child making her one of the best selling music artists of all time she has won 20 grammy awards and is the most nominated woman in the award s history the recording industry association of america recognized her as the top certified artist in america during the 2000s decade in 2009 billboard named her the top radio songs artist of the decade the top female artist of the 2000s and their artist of the millennium in 2011 time listed her among the 100 most influential people in the world in 2013 and 2014 forbes magazine also listed her as the most powerful female musician of 2015,a self described modern day feminist beyonce creates songs often characterized themes love relationships monogamy well female sexuality empowerment on stage dynamic highly choreographed performances led critics hailing one best entertainers contemporary popular music throughout career spanning 19 years sold 118 million records solo artist 60 million destiny s child making one best selling music artists time she 20 grammy awards nominated woman award s history the recording industry association america recognized top certified artist america 2000s decade in 2009 billboard named top radio songs artist decade top female artist 2000s artist millennium 2011 time listed among 100 influential people world 2013 2014 forbes magazine also listed powerful female musician 2015,modern-day beyonce 19 years over 118 million 60 million destiny 20 grammy awards the recording industry association of america the top certified artist america the 2000s decade 2009 billboard the top female artist the 2000s their artist of the millennium 2011 time 100 2013 2014 forbes 2015
39,"Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann ""Tina"" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé's name is a tribute to her mother's maiden name. Beyoncé's younger sister Solange is also a singer and a former member of Destiny's Child. Mathew is African-American, while Tina is of Louisiana Creole descent (with African, Native American, French, Cajun, and distant Irish and Spanish ancestry). Through her mother, Beyoncé is a descendant of Acadian leader Joseph Broussard. She was raised in a Methodist household.","[beyonce giselle knowles, houston, texas, celestine ann ""tina, knowles, mathew knowles, xerox, beyonce, beyonce, solange, destiny, child, mathew, african-american, tina, louisiana creole, african, native american, french, cajun, irish, spanish, beyonce, acadian, joseph broussard, methodist]",Beyonce Giselle Knowles was born in Houston Texas to Celestine Ann Tina Knowles nee Beyince a hairdresser and salon owner and Mathew Knowles a Xerox sales manager Beyonce s name is a tribute to her mother s maiden name Beyonce s younger sister Solange is also a singer and a former member of Destiny s Child Mathew is African American while Tina is of Louisiana Creole descent with African Native American French Cajun and distant Irish and Spanish ancestry Through her mother Beyonce is a descendant of Acadian leader Joseph Broussard She was raised in a Methodist household,Beyonce Giselle Knowles born Houston Texas Celestine Ann Tina Knowles nee Beyince hairdresser salon owner Mathew Knowles Xerox sales manager Beyonce s name tribute mother s maiden name Beyonce s younger sister Solange also singer former member Destiny s Child Mathew African American Tina Louisiana Creole descent with African Native American French Cajun distant Irish Spanish ancestry Through mother Beyonce descendant Acadian leader Joseph Broussard She raised Methodist household,beyonce giselle knowles was born in houston texas to celestine ann tina knowles nee beyince a hairdresser and salon owner and mathew knowles a xerox sales manager beyonce s name is a tribute to her mother s maiden name beyonce s younger sister solange is also a singer and a former member of destiny s child mathew is african american while tina is of louisiana creole descent with african native american french cajun and distant irish and spanish ancestry through her mother beyonce is a descendant of acadian leader joseph broussard she was raised in a methodist household,beyonce giselle knowles born houston texas celestine ann tina knowles nee beyince hairdresser salon owner mathew knowles xerox sales manager beyonce s name tribute mother s maiden name beyonce s younger sister solange also singer former member destiny s child mathew african american tina louisiana creole descent with african native american french cajun distant irish spanish ancestry through mother beyonce descendant acadian leader joseph broussard she raised methodist household,"beyonce giselle knowles houston texas celestine ann ""tina knowles mathew knowles xerox beyonce beyonce solange destiny child mathew african-american tina louisiana creole african native american french cajun irish spanish beyonce acadian joseph broussard methodist"
52,"Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when dance instructor Darlette Johnson began humming a song and she finished it, able to hit the high-pitched notes. Beyoncé's interest in music and performing continued after winning a school talent show at age seven, singing John Lennon's ""Imagine"" to beat 15/16-year-olds. In fall of 1990, Beyoncé enrolled in Parker Elementary School, a music magnet school in Houston, where she would perform with the school's choir. She also attended the High School for the Performing and Visual Arts and later Alief Elsik High School. Beyoncé was also a member of the choir at St. John's United Methodist Church as a soloist for two years.","[st. mary's, fredericksburg, texas, darlette johnson, beyonce, age seven, john lennon's, imagine, 15/16-year-olds, fall of 1990, beyonce, parker elementary school, houston, the high school for the performing and visual arts, elsik high school, beyonce, st. john's, united methodist church, two years]",Beyonce attended St Mary s Elementary School in Fredericksburg Texas where she enrolled in dance classes Her singing talent was discovered when dance instructor Darlette Johnson began humming a song and she finished it able to hit the high pitched notes Beyonce s interest in music and performing continued after winning a school talent show at age seven singing John Lennon s Imagine to beat 15 16 year olds In fall of 1990 Beyonce enrolled in Parker Elementary School a music magnet school in Houston where she would perform with the school s choir She also attended the High School for the Performing and Visual Arts and later Alief Elsik High School Beyonce was also a member of the choir at St John s United Methodist Church as a soloist for two years,Beyonce attended St Mary s Elementary School Fredericksburg Texas enrolled dance classes Her singing talent discovered dance instructor Darlette Johnson began humming song finished it able hit high pitched notes Beyonce s interest music performing continued winning school talent show age seven singing John Lennon s Imagine beat 15 16 year olds In fall 1990 Beyonce enrolled Parker Elementary School music magnet school Houston would perform school s choir She also attended High School Performing Visual Arts later Alief Elsik High School Beyonce also member choir St John s United Methodist Church soloist two years,beyonce attended st mary s elementary school in fredericksburg texas where she enrolled in dance classes her singing talent was discovered when dance instructor darlette johnson began humming a song and she finished it able to hit the high pitched notes beyonce s interest in music and performing continued after winning a school talent show at age seven singing john lennon s imagine to beat 15 16 year olds in fall of 1990 beyonce enrolled in parker elementary school a music magnet school in houston where she would perform with the school s choir she also attended the high school for the performing and visual arts and later alief elsik high school beyonce was also a member of the choir at st john s united methodist church as a soloist for two years,beyonce attended st mary s elementary school fredericksburg texas enrolled dance classes her singing talent discovered dance instructor darlette johnson began humming song finished it able hit high pitched notes beyonce s interest music performing continued winning school talent show age seven singing john lennon s imagine beat 15 16 year olds in fall 1990 beyonce enrolled parker elementary school music magnet school houston would perform school s choir she also attended high school performing visual arts later alief elsik high school beyonce also member choir st john s united methodist church soloist two years,st. mary's fredericksburg texas darlette johnson beyonce age seven john lennon's imagine 15/16-year-olds fall of 1990 beyonce parker elementary school houston the high school for the performing and visual arts elsik high school beyonce st. john's united methodist church two years
63,"At age eight, Beyoncé and childhood friend Kelly Rowland met LaTavia Roberson while in an audition for an all-girl entertainment group. They were placed into a group with three other girls as Girl's Tyme, and rapped and danced on the talent show circuit in Houston. After seeing the group, R&B producer Arne Frager brought them to his Northern California studio and placed them in Star Search, the largest talent show on national TV at the time. Girl's Tyme failed to win, and Beyoncé later said the song they performed was not good. In 1995 Beyoncé's father resigned from his job to manage the group. The move reduced Beyoncé's family's income by half, and her parents were forced to move into separated apartments. Mathew cut the original line-up to four and the group continued performing as an opening act for other established R&B girl groups. The girls auditioned before record labels and were finally signed to Elektra Records, moving to Atlanta Records briefly to work on their first recording, only to be cut by the company. This put further strain on the family, and Beyoncé's parents separated. On October 5, 1995, Dwayne Wiggins's Grass Roots Entertainment signed the group. In 1996, the girls began recording their debut album under an agreement with Sony Music, the Knowles family reunited, and shortly after, the group got a contract with Columbia Records.","[age eight, beyonce, kelly rowland, latavia roberson, three, girl's tyme, houston, r&b, arne frager, northern california, star search, tyme, beyonce, 1995, beyonce, beyonce, half, mathew, up to four, r&b, elektra records, atlanta records, first, beyonce, october 5, 1995, dwayne wiggins's, grass roots entertainment, 1996, sony music, knowles, columbia records]",At age eight Beyonce and childhood friend Kelly Rowland met LaTavia Roberson while in an audition for an all girl entertainment group They were placed into a group with three other girls as Girl s Tyme and rapped and danced on the talent show circuit in Houston After seeing the group R B producer Arne Frager brought them to his Northern California studio and placed them in Star Search the largest talent show on national TV at the time Girl s Tyme failed to win and Beyonce later said the song they performed was not good In 1995 Beyonce s father resigned from his job to manage the group The move reduced Beyonce s family s income by half and her parents were forced to move into separated apartments Mathew cut the original line up to four and the group continued performing as an opening act for other established R B girl groups The girls auditioned before record labels and were finally signed to Elektra Records moving to Atlanta Records briefly to work on their first recording only to be cut by the company This put further strain on the family and Beyonce s parents separated On October 5 1995 Dwayne Wiggins s Grass Roots Entertainment signed the group In 1996 the girls began recording their debut album under an agreement with Sony Music the Knowles family reunited and shortly after the group got a contract with Columbia Records,At age eight Beyonce childhood friend Kelly Rowland met LaTavia Roberson audition all girl entertainment group They placed group three girls Girl s Tyme rapped danced talent show circuit Houston After seeing group R B producer Arne Frager brought Northern California studio placed Star Search largest talent show national TV time Girl s Tyme failed win Beyonce later said song performed good In 1995 Beyonce s father resigned job manage group The move reduced Beyonce s family s income half parents forced move separated apartments Mathew cut original line up four group continued performing opening act established R B girl groups The girls auditioned record labels finally signed Elektra Records moving Atlanta Records briefly work first recording cut company This put strain family Beyonce s parents separated On October 5 1995 Dwayne Wiggins s Grass Roots Entertainment signed group In 1996 girls began recording debut album agreement Sony Music Knowles family reunited shortly after group got contract Columbia Records,at age eight beyonce and childhood friend kelly rowland met latavia roberson while in an audition for an all girl entertainment group they were placed into a group with three other girls as girl s tyme and rapped and danced on the talent show circuit in houston after seeing the group r b producer arne frager brought them to his northern california studio and placed them in star search the largest talent show on national tv at the time girl s tyme failed to win and beyonce later said the song they performed was not good in 1995 beyonce s father resigned from his job to manage the group the move reduced beyonce s family s income by half and her parents were forced to move into separated apartments mathew cut the original line up to four and the group continued performing as an opening act for other established r b girl groups the girls auditioned before record labels and were finally signed to elektra records moving to atlanta records briefly to work on their first recording only to be cut by the company this put further strain on the family and beyonce s parents separated on october 5 1995 dwayne wiggins s grass roots entertainment signed the group in 1996 the girls began recording their debut album under an agreement with sony music the knowles family reunited and shortly after the group got a contract with columbia records,at age eight beyonce childhood friend kelly rowland met latavia roberson audition all girl entertainment group they placed group three girls girl s tyme rapped danced talent show circuit houston after seeing group r b producer arne frager brought northern california studio placed star search largest talent show national tv time girl s tyme failed win beyonce later said song performed good in 1995 beyonce s father resigned job manage group the move reduced beyonce s family s income half parents forced move separated apartments mathew cut original line up four group continued performing opening act established r b girl groups the girls auditioned record labels finally signed elektra records moving atlanta records briefly work first recording cut company this put strain family beyonce s parents separated on october 5 1995 dwayne wiggins s grass roots entertainment signed group in 1996 girls began recording debut album agreement sony music knowles family reunited shortly after group got contract columbia records,age eight beyonce kelly rowland latavia roberson three girl's tyme houston r&b arne frager northern california star search tyme beyonce 1995 beyonce beyonce half mathew up to four r&b elektra records atlanta records first beyonce october 5 1995 dwayne wiggins's grass roots entertainment 1996 sony music knowles columbia records
76,"The group changed their name to Destiny's Child in 1996, based upon a passage in the Book of Isaiah. In 1997, Destiny's Child released their major label debut song ""Killing Time"" on the soundtrack to the 1997 film, Men in Black. The following year, the group released their self-titled debut album, scoring their first major hit ""No, No, No"". The album established the group as a viable act in the music industry, with moderate sales and winning the group three Soul Train Lady of Soul Awards for Best R&B/Soul Album of the Year, Best R&B/Soul or Rap New Artist, and Best R&B/Soul Single for ""No, No, No"". The group released their multi-platinum second album The Writing's on the Wall in 1999. The record features some of the group's most widely known songs such as ""Bills, Bills, Bills"", the group's first number-one single, ""Jumpin' Jumpin'"" and ""Say My Name"", which became their most successful song at the time, and would remain one of their signature songs. ""Say My Name"" won the Best R&B Performance by a Duo or Group with Vocals and the Best R&B Song at the 43rd Annual Grammy Awards. The Writing's on the Wall sold more than eight million copies worldwide. During this time, Beyoncé recorded a duet with Marc Nelson, an original member of Boyz II Men, on the song ""After All Is Said and Done"" for the soundtrack to the 1999 film, The Best Man.","[destiny, 1996, the book of isaiah, 1997, destiny, child, killing time, 1997, men in black, the following year, first, three, soul train lady of soul awards for best r&b/soul album of the year, rap new artist, soul single, second, the writing's on the wall in 1999, bills, bills, bills, first, one, jumpin' jumpin', say my name, say my name, the best r&b performance by a duo or group with vocals, 43rd, annual grammy awards, the writing's, wall, more than eight million, beyonce, marc nelson, boyz ii men, after all is said and done, 1999, the best man]",The group changed their name to Destiny s Child in 1996 based upon a passage in the Book of Isaiah In 1997 Destiny s Child released their major label debut song Killing Time on the soundtrack to the 1997 film Men in Black The following year the group released their self titled debut album scoring their first major hit No No No The album established the group as a viable act in the music industry with moderate sales and winning the group three Soul Train Lady of Soul Awards for Best R B Soul Album of the Year Best R B Soul or Rap New Artist and Best R B Soul Single for No No No The group released their multi platinum second album The Writing s on the Wall in 1999 The record features some of the group s most widely known songs such as Bills Bills Bills the group s first number one single Jumpin Jumpin and Say My Name which became their most successful song at the time and would remain one of their signature songs Say My Name won the Best R B Performance by a Duo or Group with Vocals and the Best R B Song at the 43rd Annual Grammy Awards The Writing s on the Wall sold more than eight million copies worldwide During this time Beyonce recorded a duet with Marc Nelson an original member of Boyz II Men on the song After All Is Said and Done for the soundtrack to the 1999 film The Best Man,The group changed name Destiny s Child 1996 based upon passage Book Isaiah In 1997 Destiny s Child released major label debut song Killing Time soundtrack 1997 film Men Black The following year group released self titled debut album scoring first major hit No No No The album established group viable act music industry moderate sales winning group three Soul Train Lady Soul Awards Best R B Soul Album Year Best R B Soul Rap New Artist Best R B Soul Single No No No The group released multi platinum second album The Writing s Wall 1999 The record features group s widely known songs Bills Bills Bills group s first number one single Jumpin Jumpin Say My Name became successful song time would remain one signature songs Say My Name Best R B Performance Duo Group Vocals Best R B Song 43rd Annual Grammy Awards The Writing s Wall sold eight million copies worldwide During time Beyonce recorded duet Marc Nelson original member Boyz II Men song After All Is Said Done soundtrack 1999 film The Best Man,the group changed their name to destiny s child in 1996 based upon a passage in the book of isaiah in 1997 destiny s child released their major label debut song killing time on the soundtrack to the 1997 film men in black the following year the group released their self titled debut album scoring their first major hit no no no the album established the group as a viable act in the music industry with moderate sales and winning the group three soul train lady of soul awards for best r b soul album of the year best r b soul or rap new artist and best r b soul single for no no no the group released their multi platinum second album the writing s on the wall in 1999 the record features some of the group s most widely known songs such as bills bills bills the group s first number one single jumpin jumpin and say my name which became their most successful song at the time and would remain one of their signature songs say my name won the best r b performance by a duo or group with vocals and the best r b song at the 43rd annual grammy awards the writing s on the wall sold more than eight million copies worldwide during this time beyonce recorded a duet with marc nelson an original member of boyz ii men on the song after all is said and done for the soundtrack to the 1999 film the best man,the group changed name destiny s child 1996 based upon passage book isaiah in 1997 destiny s child released major label debut song killing time soundtrack 1997 film men black the following year group released self titled debut album scoring first major hit no no no the album established group viable act music industry moderate sales winning group three soul train lady soul awards best r b soul album year best r b soul rap new artist best r b soul single no no no the group released multi platinum second album the writing s wall 1999 the record features group s widely known songs bills bills bills group s first number one single jumpin jumpin say my name became successful song time would remain one signature songs say my name best r b performance duo group vocals best r b song 43rd annual grammy awards the writing s wall sold eight million copies worldwide during time beyonce recorded duet marc nelson original member boyz ii men song after all is said done soundtrack 1999 film the best man,destiny 1996 the book of isaiah 1997 destiny child killing time 1997 men in black the following year first three soul train lady of soul awards for best r&b/soul album of the year rap new artist soul single second the writing's on the wall in 1999 bills bills bills first one jumpin' jumpin' say my name say my name the best r&b performance by a duo or group with vocals 43rd annual grammy awards the writing's wall more than eight million beyonce marc nelson boyz ii men after all is said and done 1999 the best man
89,"LeToya Luckett and Roberson became unhappy with Mathew's managing of the band and eventually were replaced by Farrah Franklin and Michelle Williams. Beyoncé experienced depression following the split with Luckett and Roberson after being publicly blamed by the media, critics, and blogs for its cause. Her long-standing boyfriend left her at this time. The depression was so severe it lasted for a couple of years, during which she occasionally kept herself in her bedroom for days and refused to eat anything. Beyoncé stated that she struggled to speak about her depression because Destiny's Child had just won their first Grammy Award and she feared no one would take her seriously. Beyoncé would later speak of her mother as the person who helped her fight it. Franklin was dismissed, leaving just Beyoncé, Rowland, and Williams.","[letoya luckett, roberson, mathew, farrah franklin, michelle williams, beyonce, luckett, roberson, a couple of years, days, beyonce, destiny, child, first, grammy award, beyonce, franklin, beyonce, rowland, williams]",LeToya Luckett and Roberson became unhappy with Mathew s managing of the band and eventually were replaced by Farrah Franklin and Michelle Williams Beyonce experienced depression following the split with Luckett and Roberson after being publicly blamed by the media critics and blogs for its cause Her long standing boyfriend left her at this time The depression was so severe it lasted for a couple of years during which she occasionally kept herself in her bedroom for days and refused to eat anything Beyonce stated that she struggled to speak about her depression because Destiny s Child had just won their first Grammy Award and she feared no one would take her seriously Beyonce would later speak of her mother as the person who helped her fight it Franklin was dismissed leaving just Beyonce Rowland and Williams,LeToya Luckett Roberson became unhappy Mathew s managing band eventually replaced Farrah Franklin Michelle Williams Beyonce experienced depression following split Luckett Roberson publicly blamed media critics blogs cause Her long standing boyfriend left time The depression severe lasted couple years occasionally kept bedroom days refused eat anything Beyonce stated struggled speak depression Destiny s Child first Grammy Award feared one would take seriously Beyonce would later speak mother person helped fight it Franklin dismissed leaving Beyonce Rowland Williams,letoya luckett and roberson became unhappy with mathew s managing of the band and eventually were replaced by farrah franklin and michelle williams beyonce experienced depression following the split with luckett and roberson after being publicly blamed by the media critics and blogs for its cause her long standing boyfriend left her at this time the depression was so severe it lasted for a couple of years during which she occasionally kept herself in her bedroom for days and refused to eat anything beyonce stated that she struggled to speak about her depression because destiny s child had just won their first grammy award and she feared no one would take her seriously beyonce would later speak of her mother as the person who helped her fight it franklin was dismissed leaving just beyonce rowland and williams,letoya luckett roberson became unhappy mathew s managing band eventually replaced farrah franklin michelle williams beyonce experienced depression following split luckett roberson publicly blamed media critics blogs cause her long standing boyfriend left time the depression severe lasted couple years occasionally kept bedroom days refused eat anything beyonce stated struggled speak depression destiny s child first grammy award feared one would take seriously beyonce would later speak mother person helped fight it franklin dismissed leaving beyonce rowland williams,letoya luckett roberson mathew farrah franklin michelle williams beyonce luckett roberson a couple of years days beyonce destiny child first grammy award beyonce franklin beyonce rowland williams
99,"The remaining band members recorded ""Independent Women Part I"", which appeared on the soundtrack to the 2000 film, Charlie's Angels. It became their best-charting single, topping the U.S. Billboard Hot 100 chart for eleven consecutive weeks. In early 2001, while Destiny's Child was completing their third album, Beyoncé landed a major role in the MTV made-for-television film, Carmen: A Hip Hopera, starring alongside American actor Mekhi Phifer. Set in Philadelphia, the film is a modern interpretation of the 19th century opera Carmen by French composer Georges Bizet. When the third album Survivor was released in May 2001, Luckett and Roberson filed a lawsuit claiming that the songs were aimed at them. The album debuted at number one on the U.S. Billboard 200, with first-week sales of 663,000 copies sold. The album spawned other number-one hits, ""Bootylicious"" and the title track, ""Survivor"", the latter of which earned the group a Grammy Award for Best R&B Performance by a Duo or Group with Vocals. After releasing their holiday album 8 Days of Christmas in October 2001, the group announced a hiatus to further pursue solo careers.","[independent women part i, 2000, charlie, angels, u.s., billboard hot, 100, eleven consecutive weeks, early 2001, destiny, third, beyonce, mtv, american, mekhi phifer, philadelphia, the 19th century, carmen, french, georges bizet, third, may 2001, luckett, roberson, one, u.s., first-week, 663,000, one, 8 days of christmas, october 2001]",The remaining band members recorded Independent Women Part I which appeared on the soundtrack to the 2000 film Charlie s Angels It became their best charting single topping the U S Billboard Hot 100 chart for eleven consecutive weeks In early 2001 while Destiny s Child was completing their third album Beyonce landed a major role in the MTV made for television film Carmen A Hip Hopera starring alongside American actor Mekhi Phifer Set in Philadelphia the film is a modern interpretation of the 19th century opera Carmen by French composer Georges Bizet When the third album Survivor was released in May 2001 Luckett and Roberson filed a lawsuit claiming that the songs were aimed at them The album debuted at number one on the U S Billboard 200 with first week sales of 663 000 copies sold The album spawned other number one hits Bootylicious and the title track Survivor the latter of which earned the group a Grammy Award for Best R B Performance by a Duo or Group with Vocals After releasing their holiday album 8 Days of Christmas in October 2001 the group announced a hiatus to further pursue solo careers,The remaining band members recorded Independent Women Part I appeared soundtrack 2000 film Charlie s Angels It became best charting single topping U S Billboard Hot 100 chart eleven consecutive weeks In early 2001 Destiny s Child completing third album Beyonce landed major role MTV made for television film Carmen A Hip Hopera starring alongside American actor Mekhi Phifer Set Philadelphia film modern interpretation 19th century opera Carmen French composer Georges Bizet When third album Survivor released May 2001 Luckett Roberson filed lawsuit claiming songs aimed them The album debuted number one U S Billboard 200 first week sales 663 000 copies sold The album spawned number one hits Bootylicious title track Survivor latter earned group Grammy Award Best R B Performance Duo Group Vocals After releasing holiday album 8 Days Christmas October 2001 group announced hiatus pursue solo careers,the remaining band members recorded independent women part i which appeared on the soundtrack to the 2000 film charlie s angels it became their best charting single topping the u s billboard hot 100 chart for eleven consecutive weeks in early 2001 while destiny s child was completing their third album beyonce landed a major role in the mtv made for television film carmen a hip hopera starring alongside american actor mekhi phifer set in philadelphia the film is a modern interpretation of the 19th century opera carmen by french composer georges bizet when the third album survivor was released in may 2001 luckett and roberson filed a lawsuit claiming that the songs were aimed at them the album debuted at number one on the u s billboard 200 with first week sales of 663 000 copies sold the album spawned other number one hits bootylicious and the title track survivor the latter of which earned the group a grammy award for best r b performance by a duo or group with vocals after releasing their holiday album 8 days of christmas in october 2001 the group announced a hiatus to further pursue solo careers,the remaining band members recorded independent women part i appeared soundtrack 2000 film charlie s angels it became best charting single topping u s billboard hot 100 chart eleven consecutive weeks in early 2001 destiny s child completing third album beyonce landed major role mtv made for television film carmen a hip hopera starring alongside american actor mekhi phifer set philadelphia film modern interpretation 19th century opera carmen french composer georges bizet when third album survivor released may 2001 luckett roberson filed lawsuit claiming songs aimed them the album debuted number one u s billboard 200 first week sales 663 000 copies sold the album spawned number one hits bootylicious title track survivor latter earned group grammy award best r b performance duo group vocals after releasing holiday album 8 days christmas october 2001 group announced hiatus pursue solo careers,independent women part i 2000 charlie angels u.s. billboard hot 100 eleven consecutive weeks early 2001 destiny third beyonce mtv american mekhi phifer philadelphia the 19th century carmen french georges bizet third may 2001 luckett roberson one u.s. first-week 663000 one 8 days of christmas october 2001
110,"In July 2002, Beyoncé continued her acting career playing Foxxy Cleopatra alongside Mike Myers in the comedy film, Austin Powers in Goldmember, which spent its first weekend atop the US box office and grossed $73 million. Beyoncé released ""Work It Out"" as the lead single from its soundtrack album which entered the top ten in the UK, Norway, and Belgium. In 2003, Beyoncé starred opposite Cuba Gooding, Jr., in the musical comedy The Fighting Temptations as Lilly, a single mother whom Gooding's character falls in love with. The film received mixed reviews from critics but grossed $30 million in the U.S. Beyoncé released ""Fighting Temptation"" as the lead single from the film's soundtrack album, with Missy Elliott, MC Lyte, and Free which was also used to promote the film. Another of Beyoncé's contributions to the soundtrack, ""Summertime"", fared better on the US charts.","[july 2002, beyonce, foxxy cleopatra, mike myers, austin powers, goldmember, first, us, $73 million, work it out, ten, uk, norway, belgium, 2003, beyonce, cuba gooding, jr., lilly, gooding, $30 million, u.s., fighting temptation, missy elliott, mc lyte, free, beyonce, summertime, us]",In July 2002 Beyonce continued her acting career playing Foxxy Cleopatra alongside Mike Myers in the comedy film Austin Powers in Goldmember which spent its first weekend atop the US box office and grossed 73 million Beyonce released Work It Out as the lead single from its soundtrack album which entered the top ten in the UK Norway and Belgium In 2003 Beyonce starred opposite Cuba Gooding Jr in the musical comedy The Fighting Temptations as Lilly a single mother whom Gooding s character falls in love with The film received mixed reviews from critics but grossed 30 million in the U S Beyonce released Fighting Temptation as the lead single from the film s soundtrack album with Missy Elliott MC Lyte and Free which was also used to promote the film Another of Beyonce s contributions to the soundtrack Summertime fared better on the US charts,In July 2002 Beyonce continued acting career playing Foxxy Cleopatra alongside Mike Myers comedy film Austin Powers Goldmember spent first weekend atop US box office grossed 73 million Beyonce released Work It Out lead single soundtrack album entered top ten UK Norway Belgium In 2003 Beyonce starred opposite Cuba Gooding Jr musical comedy The Fighting Temptations Lilly single mother Gooding s character falls love with The film received mixed reviews critics grossed 30 million U S Beyonce released Fighting Temptation lead single film s soundtrack album Missy Elliott MC Lyte Free also used promote film Another Beyonce s contributions soundtrack Summertime fared better US charts,in july 2002 beyonce continued her acting career playing foxxy cleopatra alongside mike myers in the comedy film austin powers in goldmember which spent its first weekend atop the us box office and grossed 73 million beyonce released work it out as the lead single from its soundtrack album which entered the top ten in the uk norway and belgium in 2003 beyonce starred opposite cuba gooding jr in the musical comedy the fighting temptations as lilly a single mother whom gooding s character falls in love with the film received mixed reviews from critics but grossed 30 million in the u s beyonce released fighting temptation as the lead single from the film s soundtrack album with missy elliott mc lyte and free which was also used to promote the film another of beyonce s contributions to the soundtrack summertime fared better on the us charts,in july 2002 beyonce continued acting career playing foxxy cleopatra alongside mike myers comedy film austin powers goldmember spent first weekend atop us box office grossed 73 million beyonce released work it out lead single soundtrack album entered top ten uk norway belgium in 2003 beyonce starred opposite cuba gooding jr musical comedy the fighting temptations lilly single mother gooding s character falls love with the film received mixed reviews critics grossed 30 million u s beyonce released fighting temptation lead single film s soundtrack album missy elliott mc lyte free also used promote film another beyonce s contributions soundtrack summertime fared better us charts,july 2002 beyonce foxxy cleopatra mike myers austin powers goldmember first us $73 million work it out ten uk norway belgium 2003 beyonce cuba gooding jr. lilly gooding $30 million u.s. fighting temptation missy elliott mc lyte free beyonce summertime us


In [71]:
df.to_pickle('cleaned_ner_23_mar.pkl')

## Read .pkl File

In [10]:
#Read the pickle file
df2 = pd.read_pickle('cleaned_ner_23_mar.pkl')

In [12]:
ner_vectorizer = TfidfVectorizer()
context_vectorizer = TfidfVectorizer()
ner_tfidf = ner_vectorizer.fit_transform(df2['ner_combined'])
context_tfidf = context_vectorizer.fit_transform(df2['cleaned_lowercase_nostop'])

In [67]:
' '.join([])

''

In [110]:
def get_ner_matching_ids(query):
    for_ner_matching = re.sub('[%s]' % re.escape(string.punctuation), '', query)
    for_ner_matching = re.sub('\s{2,}', " ", for_ner_matching).lower()
    for_ner_matching = ' '.join([word for word in for_ner_matching.split() if word not in stop_words])
    for_ner_matching = truecase.get_true_case(for_ner_matching)
    
    query_ners = ' '.join(generate_ners(for_ner_matching))
    if query_ners == '':
        return None
    print(f"query_ners: {query_ners}")
    
    ner_matching_ids = get_similar_doc(ner_vectorizer, ner_tfidf, query_ners)
    
    # to add: context tfidf
    
    return ner_matching_ids

In [112]:
# print(get_ner_matching_docs('when was donald trump elected'))
print(get_ner_matching_ids('What leader tried to unite all people considered themselves "German"'))

query_ners: german
[3444, 3453, 3454, 3456, 3457, 8879, 8881, 17800]


In [None]:
# def 
#     q1 = ner
#     q2 = tfidf context
    
#     if overlap:
#         return overlap
#     else:
        

## TF-IF Context Querying

In [None]:
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(df['cleaned_context'])

In [None]:
print(tfidf.shape)

In [119]:
def get_similar_docs_ner(vectorizer, docs_tfidf, query):
    # returns list of doc_ids using NER
    for_ner_matching = re.sub('[%s]' % re.escape(string.punctuation), '', query)
    for_ner_matching = re.sub('\s{2,}', " ", for_ner_matching).lower()
    for_ner_matching = ' '.join([word for word in for_ner_matching.split() if word not in stop_words])
    for_ner_matching = truecase.get_true_case(for_ner_matching)
    
    query_ners = ' '.join(generate_ners(for_ner_matching))
    if query_ners == '':
        return []
    
    ner_matching_ids = get_similar_doc(ner_vectorizer, ner_tfidf, query_ners)
    return ner_matching_ids
    
    return ner_matching_ids
def get_similar_docs_ngrams():
    # returns list of doc_ids using ngram-tfidf

def get_similar_docs(tfidfvectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: doc with highest tf-idf cosine similarity
    """
    query_tfidf = tfidfvectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    
    doc_ids = []
    for idx, val in enumerate(cosineSimilarities):
        if val >= 0.7:
            doc_ids.append(idx)
            
    if len(doc_ids) == 0:
            
    return doc_ids

def get_relevant_sentence(doc_ids, query):
    print(doc_ids)
    docs = df2.iloc[doc_ids, df2.columns.get_loc('context')].apply(lambda x: [i.strip() for i in x.split('.') if len(i)>1])
    sentences = []
    for doc in docs:
        for sentence in doc:
            sentences.append(sentence)

    vectorizer = S2vectorizer()
    vectorizer.bert([query] + sentences)
    vectors = vectorizer.vectors
    query_vec, other_vec = vectors[0], vectors[1:]
    
    print("----- Sentences Retrieved: -----")
    for idx, sentence in enumerate(sentences):
        print(f"{idx}. {sentence}")
    
    distances = distance.cdist([query_vec], other_vec, "cosine")[0]
    min_index = np.argmin(distances)
#     min_distance = distances[min_index]
#     max_similarity = 1 - min_distance
    return sentences[min_index]
    
def get_answer(query):
    query = re.sub('[%s]' % re.escape(string.punctuation), '', query)
    query = re.sub('\s{2,}', " ", query).lower()
    most_similar_doc_ids = get_similar_doc(vectorizer, tfidf, query)
    return get_relevant_sentence(most_similar_doc_ids, query)

def get_answer_ner(query):
    ner_matching_ids = get_ner_matching_ids(query)
    docs = df2.iloc[ner_matching_ids, df2.columns.get_loc('context')].apply(lambda x: [i.strip() for i in x.split('.') if len(i)>1])
    return docs

In [129]:
get_answer_ner('when did beyonce release her multi-platinum second album')

query_ners: multiplatinum second


11919    [Water splitting, in which water is decomposed into its component protons, electrons, and oxygen, occurs in the light reactions in all photosynthetic organisms, Some such organisms, including the alga Chlamydomonas reinhardtii and cyanobacteria, have evolved a second step in the dark reactions in which protons and electrons are reduced to form H2 gas by specialized hydrogenases in the chloroplast, Efforts have been undertaken to genetically modify cyanobacterial hydrogenases to efficiently synthesize H2 gas even in the presence of oxygen, Efforts have also been undertaken with genetically modified alga in a bioreactor]                                                                                                                                                                                                                                                                                                                                                                      
15783 

## Gigi

### infersent

In [115]:
def clean_lowercase_punc_nostop(text):
    text = ' '.join([word for word in text.split() if word not in stop_words])
#     text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\s{2,}', " ", text)
    text = unidecode(text)
    return text.lower()

In [116]:
df_infersent = pd.read_csv('../data/SQuAD_csv.csv', encoding='utf-8').loc[:, ['context']]
df_infersent = df_infersent.drop_duplicates(subset=['context'])

df_infersent['context'] = df_infersent['context'].apply(lambda x: str(x).lower())
df_infersent['cleaned_lowercase_punc_nostop'] = df['context'].progress_apply(clean_lowercase_punc_nostop)

HBox(children=(FloatProgress(value=0.0, max=18877.0), HTML(value='')))




In [127]:
df_infersent.head()

Unnamed: 0,context,cleaned_lowercase_punc_nostop
0,"beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ bee-yon-say) (born september 4, 1981) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r&b girl-group destiny's child. managed by her father, mathew knowles, the group became one of the world's best-selling girl groups of all time. their hiatus saw the release of beyoncé's debut album, dangerously in love (2003), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number-one singles ""crazy in love"" and ""baby boy"".","beyonce giselle knowles-carter (/bi:'jansei/ bee-yon-say) (born september 4, 1981) american singer, songwriter, record producer actress. born raised houston, texas, performed various singing dancing competitions child, rose fame late 1990s lead singer r&b girl-group destiny's child. managed father, mathew knowles, group became one world's best-selling girl groups time. hiatus saw release beyonce's debut album, dangerously love (2003), established solo artist worldwide, earned five grammy awards featured billboard hot 100 number-one singles ""crazy love"" ""baby boy""."
15,"following the disbandment of destiny's child in june 2005, she released her second solo album, b'day (2006), which contained hits ""déjà vu"", ""irreplaceable"", and ""beautiful liar"". beyoncé also ventured into acting, with a golden globe-nominated performance in dreamgirls (2006), and starring roles in the pink panther (2006) and obsessed (2009). her marriage to rapper jay z and portrayal of etta james in cadillac records (2008) influenced her third album, i am... sasha fierce (2008), which saw the birth of her alter-ego sasha fierce and earned a record-setting six grammy awards in 2010, including song of the year for ""single ladies (put a ring on it)"". beyoncé took a hiatus from music in 2010 and took over management of her career; her fourth album 4 (2011) was subsequently mellower in tone, exploring 1970s funk, 1980s pop, and 1990s soul. her critically acclaimed fifth studio album, beyoncé (2013), was distinguished from previous releases by its experimental production and exploration of darker themes.","following disbandment destiny's child june 2005, released second solo album, b'day (2006), contained hits ""deja vu"", ""irreplaceable"", ""beautiful liar"". beyonce also ventured acting, golden globe-nominated performance dreamgirls (2006), starring roles pink panther (2006) obsessed (2009). marriage rapper jay z portrayal etta james cadillac records (2008) influenced third album, am... sasha fierce (2008), saw birth alter-ego sasha fierce earned record-setting six grammy awards 2010, including song year ""single ladies (put ring it)"". beyonce took hiatus music 2010 took management career; fourth album 4 (2011) subsequently mellower tone, exploring 1970s funk, 1980s pop, 1990s soul. critically acclaimed fifth studio album, beyonce (2013), distinguished previous releases experimental production exploration darker themes."
27,"a self-described ""modern-day feminist"", beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. on stage, her dynamic, highly choreographed performances have led to critics hailing her as one of the best entertainers in contemporary popular music. throughout a career spanning 19 years, she has sold over 118 million records as a solo artist, and a further 60 million with destiny's child, making her one of the best-selling music artists of all time. she has won 20 grammy awards and is the most nominated woman in the award's history. the recording industry association of america recognized her as the top certified artist in america during the 2000s decade. in 2009, billboard named her the top radio songs artist of the decade, the top female artist of the 2000s and their artist of the millennium in 2011. time listed her among the 100 most influential people in the world in 2013 and 2014. forbes magazine also listed her as the most powerful female musician of 2015.","self-described ""modern-day feminist"", beyonce creates songs often characterized themes love, relationships, monogamy, well female sexuality empowerment. stage, dynamic, highly choreographed performances led critics hailing one best entertainers contemporary popular music. throughout career spanning 19 years, sold 118 million records solo artist, 60 million destiny's child, making one best-selling music artists time. 20 grammy awards nominated woman award's history. recording industry association america recognized top certified artist america 2000s decade. 2009, billboard named top radio songs artist decade, top female artist 2000s artist millennium 2011. time listed among 100 influential people world 2013 2014. forbes magazine also listed powerful female musician 2015."
39,"beyoncé giselle knowles was born in houston, texas, to celestine ann ""tina"" knowles (née beyincé), a hairdresser and salon owner, and mathew knowles, a xerox sales manager. beyoncé's name is a tribute to her mother's maiden name. beyoncé's younger sister solange is also a singer and a former member of destiny's child. mathew is african-american, while tina is of louisiana creole descent (with african, native american, french, cajun, and distant irish and spanish ancestry). through her mother, beyoncé is a descendant of acadian leader joseph broussard. she was raised in a methodist household.","beyonce giselle knowles born houston, texas, celestine ann ""tina"" knowles (nee beyince), hairdresser salon owner, mathew knowles, xerox sales manager. beyonce's name tribute mother's maiden name. beyonce's younger sister solange also singer former member destiny's child. mathew african-american, tina louisiana creole descent (with african, native american, french, cajun, distant irish spanish ancestry). mother, beyonce descendant acadian leader joseph broussard. raised methodist household."
52,"beyoncé attended st. mary's elementary school in fredericksburg, texas, where she enrolled in dance classes. her singing talent was discovered when dance instructor darlette johnson began humming a song and she finished it, able to hit the high-pitched notes. beyoncé's interest in music and performing continued after winning a school talent show at age seven, singing john lennon's ""imagine"" to beat 15/16-year-olds. in fall of 1990, beyoncé enrolled in parker elementary school, a music magnet school in houston, where she would perform with the school's choir. she also attended the high school for the performing and visual arts and later alief elsik high school. beyoncé was also a member of the choir at st. john's united methodist church as a soloist for two years.","beyonce attended st. mary's elementary school fredericksburg, texas, enrolled dance classes. singing talent discovered dance instructor darlette johnson began humming song finished it, able hit high-pitched notes. beyonce's interest music performing continued winning school talent show age seven, singing john lennon's ""imagine"" beat 15/16-year-olds. fall 1990, beyonce enrolled parker elementary school, music magnet school houston, would perform school's choir. also attended high school performing visual arts later alief elsik high school. beyonce also member choir st. john's united methodist church soloist two years."


In [130]:
infersent_vectorizer = TfidfVectorizer()
# infersent_tfidf = infersent_vectorizer.fit_transform(df['cleaned_lowercase_nostop'])
infersent_tfidf = infersent_vectorizer.fit_transform(df_infersent['cleaned_lowercase_punc_nostop'])

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

def get_similar_doc_infersent(tfidfvectorizer, docs_tfidf, query):
    """
    vectorizer: TfIdfVectorizer model
    docs_tfidf: tfidf vectors for all docs
    query: query doc

    return: doc with highest tf-idf cosine similarity
    """
    
    query_tfidf = tfidfvectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    cosineSimilarities = np.array(cosineSimilarities)
    return cosineSimilarities.argsort()[-3:][::-1]

def get_sentences(doc_ids):
#     docs = df.iloc[doc_ids, df.columns.get_loc('cleaned_lowercase_nostop')].apply(lambda x: [i.strip() for i in x.split('.') if len(i)>1])
    docs = df.iloc[doc_ids, df_infersent.columns.get_loc('cleaned_lowercase_punc_nostop')].apply(lambda x: [i.strip() for i in x.split('.') if len(i)>1])
    sentences = []
    sentences = []
    for doc in docs:
        for sentence in doc:
            sentences.append(sentence)
    return sentences

def infersent(sentences, query_vec):
    similarity = []
    for sent in sentences:
        sim = cosine(query_vec, model.encode([sent])[0])
        similarity.append((sent, sim))

    return similarity
# .sort(key=lambda x: x[1], reverse=True)

    
def get_answer_infersent(query):
    query = re.sub('[%s]' % re.escape(string.punctuation), '', query)
#     query = re.sub('\s{2,}', " ", query).lower()
    most_similar_doc_ids = get_similar_doc_infersent(infersent_vectorizer, infersent_tfidf, query)
    print(df_infersent.iloc[most_similar_doc_ids, df_infersent.columns.get_loc('context')])
    sentences = get_sentences(most_similar_doc_ids)
    model.build_vocab(sentences, tokenize=True)
    query_vec = model.encode(query)[0]
    
    answers = infersent(sentences, query_vec)
#     print(sorted(answers, key=lambda x: x[1]))
    return sorted(answers, key=lambda x: x[1], reverse=True)


In [None]:
# ! mkdir encoder
# ! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl
  
# ! mkdir GloVe
# ! curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
# ! unzip GloVe/glove.840B.300d.zip -d GloVe/

In [123]:
from models import InferSent
import torch

V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

W2V_PATH = 'GloVe/glove.840B.300d.txt'
model.set_w2v_path(W2V_PATH)

In [131]:
get_answer_infersent("What kind of monarchy was formed under Queen Victoria")

16750    at certain times of the year, the queen mary 2, queen elizabeth and queen victoria may all visit southampton at the same time, in an event commonly called 'arrival of the three queens'.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
35150    through victoria's reign, the gradual establishment of a modern constitutional monarchy in britain continued. reforms 

[('certain times year queen mary 2 queen elizabeth queen victoria may visit southampton time event commonly called arrival three queens',
  0.03225669),
 ('victoria s reign gradual establishment modern constitutional monarchy britain continued reforms voting system increased power house commons expense house lords monarch 1867 walter bagehot wrote monarch retained the right consulted right encourage right warn victoria s monarchy became symbolic political placed strong emphasis morality family values contrast sexual financial personal scandals associated previous members house hanover discredited monarchy concept family monarchy burgeoning middle classes could identify solidified',
  -0.093323655),
 ('eleven days orsini s assassination attempt france victoria s eldest daughter married prince frederick william prussia london betrothed since september 1855 princess victoria 14 years old marriage delayed queen prince albert bride 17 queen albert hoped daughter son in law would liberalisin

In [133]:
get_answer_infersent("Where is Freedom Monument")

85934    the city is home to many monuments and memorials, most notably those along monument avenue. other monuments include the a.p. hill monument, the bill "bojangles" robinson monument in jackson ward, the christopher columbus monument near byrd park, and the confederate soldiers and sailors monument on libby hill. located near byrd park is the famous world war i memorial carillon, a 56-bell carillon tower. dedicated in 1956, the virginia war memorial is located on belvedere overlooking the river, and is a monument to virginians who died in battle in world war ii, the korean war, the vietnam war, the gulf war, the war in afghanistan, and the iraq war.                                                                                                                                                         
3381     the statue of liberty national monument and ellis island immigration museum are managed by the national park service and are in both the states of new york and new jersey. the

[("june 14 1987 5 000 people gathered freedom monument riga laid flowers commemorate anniversary stalin s mass deportation latvians 1941 first large demonstration baltic republics commemorate anniversary event contrary official soviet history authorities crack demonstrators encouraged larger demonstrations throughout baltic states next major anniversary august 23 molotov pact demonstration november 18 date latvia's independence 1918 november 18 1987 hundreds police civilian militiamen cordoned central square prevent demonstration freedom monument thousands lined streets riga silent protest regardless",
  -0.12536578),
 ('statue liberty national monument ellis island immigration museum managed national park service states new york new jersey joined harbor governors island national monument new york historic sites federal management manhattan island include castle clinton national monument federal hall national memorial theodore roosevelt birthplace national historic site general grant n

### trigrams tfidf

In [54]:
df = pd.read_csv('../data/SQuAD_csv.csv', encoding='utf-8').loc[:, ['context']]
df = df.drop_duplicates(subset=['context'])

df['context'] = df['context'].apply(lambda x: str(x).lower())
df['cleaned_lowercase_nostop'] = df['context'].progress_apply(clean_lowercase_nostop)

HBox(children=(FloatProgress(value=0.0, max=18877.0), HTML(value='')))




In [63]:
# Getting trigrams  
trigram_vectorizer = CountVectorizer(ngram_range = (3,3)) 
X1 = trigram_vectorizer.fit_transform(df['cleaned_lowercase_nostop'])  
features = (trigram_vectorizer.get_feature_names()) 
print("\n\nFeatures : \n", features) 
print("\n\nX1 : \n", X1.toarray()) 
  
# Applying TFIDF 
trigram_vectorizer = TfidfVectorizer(ngram_range = (3,3)) 
trigram_tfidf = trigram_vectorizer.fit_transform(df['cleaned_lowercase_nostop']) 
scores = (trigram_tfidf.toarray()) 
print("\n\nScores : \n", scores) 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)





X1 : 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Scores : 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [64]:
def get_similar_doc_trigram(tfidfvectorizer, docs_tfidf, query):
    query_tfidf = tfidfvectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    cosineSimilarities = np.array(cosineSimilarities)
    return cosineSimilarities.argsort()[-3:][::-1]

def get_trigram_matching_docs(query):
    for_trigram_matching = re.sub('[%s]' % re.escape(string.punctuation), '', query)
    for_trigram_matching = re.sub('\s{2,}', " ", for_trigram_matching).lower()
    for_trigram_matching = ' '.join([word for word in for_trigram_matching.split() if word not in stop_words])
    
    trigram_matching_ids = get_similar_doc_trigram(trigram_vectorizer, trigram_tfidf, for_trigram_matching)
    
    return df.iloc[trigram_matching_ids, df.columns.get_loc('context')]

In [78]:
print(get_trigram_matching_docs("When did Beyonce leave Destiny's Child and become a solo singer"))

86816    kathmandu metropolitan city (kmc), in order to promote international relations has established an international relations secretariat (irc). kmc's first international relationship was established in 1975 with the city of eugene, oregon, united states. this activity has been further enhanced by establishing formal relationships with 8 other cities: motsumoto city of japan, rochester of the usa, yangon (formerly rangoon) of myanmar, xi'an of the people's republic of china, minsk of belarus, and pyongyang of the democratic republic of korea. kmc's constant endeavor is to enhance its interaction with saarc countries, other international agencies and many other major cities of the world to achieve better urban management and developmental programs for kathmandu.              
28025    according to the writer of luke, mary was a relative of elizabeth, wife of the priest zechariah of the priestly division of abijah, who was herself part of the lineage of aaron and so of the tribe of 

### bigram

In [86]:
# Getting trigrams  
bigram_vectorizer = CountVectorizer(ngram_range = (1,2)) 
bigram_X1 = bigram_vectorizer.fit_transform(df['cleaned_lowercase_nostop'])  
bigram_features = (bigram_vectorizer.get_feature_names()) 
print("\n\nFeatures : \n", bigram_features) 
print("\n\nX1 : \n", bigram_X1.toarray()) 
  
# Applying TFIDF 
bigram_vectorizer = TfidfVectorizer(ngram_range = (1,2)) 
bigram_tfidf = bigram_vectorizer.fit_transform(df['cleaned_lowercase_nostop']) 
bigram_scores = (bigram_tfidf.toarray()) 
print("\n\nScores : \n", bigram_scores) 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)





X1 : 
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


Scores : 
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [87]:
def get_similar_doc_bigram(tfidfvectorizer, docs_tfidf, query):
    query_tfidf = tfidfvectorizer.transform([query])
    cosineSimilarities = cosine_similarity(query_tfidf, docs_tfidf).flatten()
    cosineSimilarities = np.array(cosineSimilarities)
    return cosineSimilarities.argsort()[-3:][::-1]

def get_bigram_matching_docs(query):
    for_bigram_matching = re.sub('[%s]' % re.escape(string.punctuation), '', query)
    for_bigram_matching = re.sub('\s{2,}', " ", for_bigram_matching).lower()
    for_bigram_matching = ' '.join([word for word in for_bigram_matching.split() if word not in stop_words])
    
    bigram_matching_ids = get_similar_doc_bigram(bigram_vectorizer, bigram_tfidf, for_bigram_matching)
    
    return df.iloc[bigram_matching_ids, df.columns.get_loc('context')]

In [92]:
print(get_bigram_matching_docs("Where did a protector put a notice because of Queen Victoria's lack of public appearances in March of 1864"))

70019    public notice is given through legal notices in newspapers, and communicated to state and county agencies within the species' area. foreign nations may also receive notice of a listing. a public hearing is mandatory if any person has requested one within 45 days of the published notice. "the purpose of the notice and comment requirement is to provide for meaningful public participation in the rulemaking process." summarized the ninth circuit court in the case of idaho farm bureau federation v. babbitt.                                                                                                                                                                                                                                                                        
34851    victoria's self-imposed isolation from the public diminished the popularity of the monarchy, and encouraged the growth of the republican movement. she did undertake her official government duties, yet chose to 

In [95]:
print(get_bigram_matching_docs("Who did Victoria try to convince Disraeli  to act against during the Russo-Turkish war"))

35079    between april 1877 and february 1878, she threatened five times to abdicate while pressuring disraeli to act against russia during the russo-turkish war, but her threats had no impact on the events or their conclusion with the congress of berlin. disraeli's expansionist foreign policy, which victoria endorsed, led to conflicts such as the anglo-zulu war and the second anglo-afghan war. "if we are to maintain our position as a first-rate power", she wrote, "we must ... be prepared for attacks and wars, somewhere or other, continually." victoria saw the expansion of the british empire as civilising and benign, protecting native peoples from more aggressive powers or cruel rulers: "it is not in our custom to annexe countries", she said, "unless we are obliged & forced to do so." to victoria's dismay, disraeli lost the 1880 general election, and gladstone returned as prime minister. when disraeli died the following year, she was blinded by "fast falling tears", and erected a memor