In [1]:
import re, nltk, spacy, gensim, string

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer;
from sklearn.decomposition import NMF;
from sklearn.preprocessing import normalize
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.read_csv('df_clean.csv')

In [3]:
df.head()

Unnamed: 0,question_text,question_lemmatize
0,how did quebec nationalists see their province...,how do quebec nationalist see -PRON- province ...
1,do you have an adopted dog how would you encou...,do -PRON- have an adopt dog how would -PRON- e...
2,why does velocity affect time does velocity af...,why do velocity affect time do velocity affect...
3,how did otto von guericke used the magdeburg h...,how do otto von guericke use the magdeburg hem...
4,can i convert montra helicon d to a mountain b...,can i convert montra helicon d to a mountain b...


In [4]:
df['question_lemmatize_clean'] = df['question_lemmatize'].str.replace('-PRON-', '')

In [5]:
df.head(5)

Unnamed: 0,question_text,question_lemmatize,question_lemmatize_clean
0,how did quebec nationalists see their province...,how do quebec nationalist see -PRON- province ...,how do quebec nationalist see province as a n...
1,do you have an adopted dog how would you encou...,do -PRON- have an adopt dog how would -PRON- e...,do have an adopt dog how would encourage peo...
2,why does velocity affect time does velocity af...,why do velocity affect time do velocity affect...,why do velocity affect time do velocity affect...
3,how did otto von guericke used the magdeburg h...,how do otto von guericke use the magdeburg hem...,how do otto von guericke use the magdeburg hem...
4,can i convert montra helicon d to a mountain b...,can i convert montra helicon d to a mountain b...,can i convert montra helicon d to a mountain b...


In [2]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter-susan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [10]:
df_nouns = pd.DataFrame(df.question_lemmatize_clean.apply(nouns))
df_nouns

Unnamed: 0,question_lemmatize_clean
0,nationalist province nation
1,do adopt dog people
2,velocity time velocity space geometry
3,guericke magdeburg hemisphere
4,helicon d mountain bike tyre
...,...
983796,facebook page page
983797,something
983798,cycle women cycle
983799,difference currency note rs rs currency note r...


In [13]:
df_nouns.to_csv('df_nouns.csv', index=False)

In [2]:
df_nouns = pd.read_csv('df_nouns.csv')

In [3]:
df_nouns.head()

Unnamed: 0,question_lemmatize_clean
0,nationalist province nation
1,do adopt dog people
2,velocity time velocity space geometry
3,guericke magdeburg hemisphere
4,helicon d mountain bike tyre


In [4]:
n_features = 4000
n_components = 20
n_top_words = 20

In [5]:
# ignore terms that have a document frequency strictly higher than 95%, 
# ignore terms that have a document frequency strictly lower than 2
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df_nouns['question_lemmatize_clean'].values.astype(str))

In [6]:
# alpha=0 means no regularization, l1_ratio=.5, the penalty is a combination of L1 and L2
nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)

In [7]:
nmf_output = nmf.fit_transform(tfidf)

In [8]:
def show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=tfidf_vectorizer, lda_model=nmf, n_words=20)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,,ab,phd,phenomenon,philippine,philosopher,philosophy,phone,photo,photograph,photographer,photography,photon,photoshop,photosynthesis,php,physic,physician,physicist,physics
Topic 1,people,reason,money,group,friend,lot,relationship,word,problem,age,government,kind,religion,talk,think,state,love,family,look,medium
Topic 2,life,change,moment,meaning,experience,purpose,example,point,love,rest,lesson,death,event,story,goal,friend,dream,earth,movie,parent
Topic 3,way,money,online,language,business,home,girl,friend,weight,number,website,app,child,month,market,account,phone,guy,company,hair
Topic 4,thing,weekend,parent,today,mind,child,friend,need,love,guy,relationship,internet,girl,happen,learn,teacher,lot,kind,family,money
Topic 5,time,period,travel,sex,movie,friend,waste,money,month,space,girl,week,change,study,lot,relationship,place,hour,guy,number
Topic 6,use,language,word,app,type,technique,phone,device,software,number,car,water,method,tool,website,oil,advantage,datum,technology,company
Topic 7,year,girl,experience,age,date,guy,relationship,boy,month,change,engineering,money,exam,drop,start,company,paper,gap,course,parent
Topic 8,india,company,state,money,business,place,scope,cost,government,course,china,pakistan,service,car,online,minister,product,city,college,bank
Topic 9,person,friend,love,relationship,kind,date,dream,message,history,talk,number,personality,reason,word,phone,money,change,age,sex,type


In [9]:
Topics_theme = ['Word start from ph', 'People/Friend/Relationship', 'Life/Experience/Love/Purpose', 'Money/Internet/Business', 
                'Weekend/Parent/Child', 'Leisure time', 'Language/technique/software', 'Relationship/Girl/Boy', 
                'Business relate to India, China or Pakistan', 'Friend/Love/Relationship', 'Difference and similarity/Language/Engineering', 
                'Culture, travel and visa requirements in several countries', 'Tips on working as software engineering', 'Book/Movie/Class/History/Physics/Chemistry/Science', 
                'Software engineer job opportunitis in Canada', 'Love/Life/Relationship', 'World/War/Language/History', 'Day/Hour/Week/Month/Sex/Place', 'School/Student/College/University', 
                'Question/Answer/Quora/Interview']
df_topic_keywords['topic_theme'] = Topics_theme

In [10]:
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,...,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19,topic_theme
Topic 0,,ab,phd,phenomenon,philippine,philosopher,philosophy,phone,photo,photograph,...,photography,photon,photoshop,photosynthesis,php,physic,physician,physicist,physics,Word start from ph
Topic 1,people,reason,money,group,friend,lot,relationship,word,problem,age,...,kind,religion,talk,think,state,love,family,look,medium,People/Friend/Relationship
Topic 2,life,change,moment,meaning,experience,purpose,example,point,love,rest,...,death,event,story,goal,friend,dream,earth,movie,parent,Life/Experience/Love/Purpose
Topic 3,way,money,online,language,business,home,girl,friend,weight,number,...,app,child,month,market,account,phone,guy,company,hair,Money/Internet/Business
Topic 4,thing,weekend,parent,today,mind,child,friend,need,love,guy,...,internet,girl,happen,learn,teacher,lot,kind,family,money,Weekend/Parent/Child
Topic 5,time,period,travel,sex,movie,friend,waste,money,month,space,...,week,change,study,lot,relationship,place,hour,guy,number,Leisure time
Topic 6,use,language,word,app,type,technique,phone,device,software,number,...,water,method,tool,website,oil,advantage,datum,technology,company,Language/technique/software
Topic 7,year,girl,experience,age,date,guy,relationship,boy,month,change,...,money,exam,drop,start,company,paper,gap,course,parent,Relationship/Girl/Boy
Topic 8,india,company,state,money,business,place,scope,cost,government,course,...,pakistan,service,car,online,minister,product,city,college,bank,"Business relate to India, China or Pakistan"
Topic 9,person,friend,love,relationship,kind,date,dream,message,history,talk,...,personality,reason,word,phone,money,change,age,sex,type,Friend/Love/Relationship


In [11]:
df_topic_keywords.set_index('topic_theme', inplace=True)

In [12]:
df_topic_keywords.T

topic_theme,Word start from ph,People/Friend/Relationship,Life/Experience/Love/Purpose,Money/Internet/Business,Weekend/Parent/Child,Leisure time,Language/technique/software,Relationship/Girl/Boy,"Business relate to India, China or Pakistan",Friend/Love/Relationship,Difference and similarity/Language/Engineering,"Culture, travel and visa requirements in several countries",Tips on working as software engineering,Book/Movie/Class/History/Physics/Chemistry/Science,Software engineer job opportunitis in Canada,Love/Life/Relationship,World/War/Language/History,Day/Hour/Week/Month/Sex/Place,School/Student/College/University,Question/Answer/Quora/Interview
Word 0,,people,life,way,thing,time,use,year,india,person,difference,country,work,book,job,woman,world,day,school,question
Word 1,ab,reason,change,money,weekend,period,language,girl,company,friend,similarity,state,tip,movie,company,man,war,period,student,quora
Word 2,phd,money,moment,online,parent,travel,word,experience,state,love,state,china,company,preparation,engineering,girl,place,hour,college,answer
Word 3,phenomenon,group,meaning,language,today,sex,app,age,money,relationship,word,war,experience,class,engineer,sex,cup,week,university,interview
Word 4,philippine,friend,experience,business,mind,movie,type,date,business,kind,term,government,home,device,experience,guy,today,month,engineering,ask
Word 5,philosopher,lot,purpose,home,child,friend,technique,guy,place,date,language,europe,engineer,exam,opportunity,friend,language,sex,class,topic
Word 6,philosophy,relationship,example,girl,friend,waste,phone,relationship,scope,dream,vs,citizen,software,history,government,date,history,place,study,paper
Word 7,phone,word,point,friend,need,money,device,boy,cost,message,engineering,language,hour,character,pay,relationship,look,night,science,type
Word 8,photo,problem,love,weight,love,month,software,month,government,history,number,america,business,read,degree,love,city,water,course,account
Word 9,photograph,age,rest,number,guy,space,number,change,course,talk,computer,compare,visa,inspire,software,age,change,exam,business,number


In [13]:
# Create Document - Topic Matrix
lda_output = nmf.transform(tfidf)

# column names
topicnames = df_topic_keywords.T.columns
# topicnames = ["Topic" + str(i) for i in range(20)]

# index names
docnames = ["Doc" + str(i) for i in range(len(df_nouns))]

# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [14]:
df_document_topic

topic_theme,Word start from ph,People/Friend/Relationship,Life/Experience/Love/Purpose,Money/Internet/Business,Weekend/Parent/Child,Leisure time,Language/technique/software,Relationship/Girl/Boy,"Business relate to India, China or Pakistan",Friend/Love/Relationship,...,"Culture, travel and visa requirements in several countries",Tips on working as software engineering,Book/Movie/Class/History/Physics/Chemistry/Science,Software engineer job opportunitis in Canada,Love/Life/Relationship,World/War/Language/History,Day/Hour/Week/Month/Sex/Place,School/Student/College/University,Question/Answer/Quora/Interview,dominant_topic
Doc0,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc1,0.0,0.03,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
Doc2,0.0,0.00,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
Doc3,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc4,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Doc983796,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc983797,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc983798,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc983799,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10


In [15]:
df_document_topic.reset_index(inplace=True)
df_sent_topic= pd.merge(df_nouns, df_document_topic, left_index=True, right_index=True)
df_sent_topic.drop('index', axis=1, inplace=True)

In [16]:
df_sent_topic

Unnamed: 0,question_lemmatize_clean,Word start from ph,People/Friend/Relationship,Life/Experience/Love/Purpose,Money/Internet/Business,Weekend/Parent/Child,Leisure time,Language/technique/software,Relationship/Girl/Boy,"Business relate to India, China or Pakistan",...,"Culture, travel and visa requirements in several countries",Tips on working as software engineering,Book/Movie/Class/History/Physics/Chemistry/Science,Software engineer job opportunitis in Canada,Love/Life/Relationship,World/War/Language/History,Day/Hour/Week/Month/Sex/Place,School/Student/College/University,Question/Answer/Quora/Interview,dominant_topic
0,nationalist province nation,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,do adopt dog people,0.0,0.03,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,velocity time velocity space geometry,0.0,0.00,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
3,guericke magdeburg hemisphere,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,helicon d mountain bike tyre,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
983796,facebook page page,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
983797,something,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
983798,cycle women cycle,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
983799,difference currency note rs rs currency note r...,0.0,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10


In [17]:
df_topic_theme = df_sent_topic[['question_lemmatize_clean', 'dominant_topic']]

In [18]:
df_topic_theme.head(10)

Unnamed: 0,question_lemmatize_clean,dominant_topic
0,nationalist province nation,0
1,do adopt dog people,1
2,velocity time velocity space geometry,5
3,guericke magdeburg hemisphere,0
4,helicon d mountain bike tyre,0
5,dachau treblinka,0
6,opinion report view,0
7,,0
8,thing dress dress,4
9,phase people feelingslive something way thing ...,1


In [19]:
def label_theme(row):
    if row['dominant_topic'] == 0 :
        return 'Word start from ph'
    if row['dominant_topic'] == 1 :
        return 'People/Friend/Relationship'
    if row['dominant_topic'] == 2 :
        return 'Life/Experience/Love/Purpose'
    if row['dominant_topic'] == 3:
        return 'Money/Internet/Business'
    if row['dominant_topic']  == 4:
        return 'Weekend/Parent/Child'
    if row['dominant_topic'] == 5:
        return 'Leisure time'
    if row['dominant_topic'] == 6:
        return 'Language/technique/software'
    if row['dominant_topic'] == 7:
        return 'Relationship/Girl/Boy'
    if row['dominant_topic'] == 8:
        return 'Business relate to India, China or Pakistan'
    if row['dominant_topic'] == 9:
        return 'Friend/Love/Relationship'
    if row['dominant_topic'] == 10:
        return 'Difference and similarity/Language/Engineering'
    if row['dominant_topic'] == 11:
        return 'Culture, travel and visa requirements in several countries'
    if row['dominant_topic'] == 12:
        return 'Tips on working as software engineering'
    if row['dominant_topic'] == 13:
        return 'Book/Movie/Class/History/Physics/Chemistry/Science'
    if row['dominant_topic'] == 14:
        return 'Software engineer job opportunitis in Canada'
    if row['dominant_topic'] == 15:
        return 'Love/Life/Relationship'
    if row['dominant_topic'] == 16:
        return 'World/War/Language/History'
    if row['dominant_topic'] == 17:
        return 'Day/Hour/Week/Month/Sex/Place'
    if row['dominant_topic'] == 18:
        return 'School/Student/College/University'
    if row['dominant_topic'] == 19:
        return 'Question/Answer/Quora/Interview'
df_topic_theme['dominant_topic_theme'] = df_topic_theme.apply (lambda row: label_theme(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
df_topic_theme.tail(15)

Unnamed: 0,question_lemmatize_clean,dominant_topic,dominant_topic_theme
983786,anyone chevrolet car chevrolet exit experience,0,Word start from ph
983787,phone i weekend weekday,0,Word start from ph
983788,series doctor,0,Word start from ph
983789,,0,Word start from ph
983790,country continent,11,"Culture, travel and visa requirements in sever..."
983791,statistic use operation research,6,Language/technique/software
983792,leak gasket engine,0,Word start from ph
983793,opportunity manager marketing operation backgr...,0,Word start from ph
983794,woman chess player,15,Love/Life/Relationship
983795,college,18,School/Student/College/University


In [29]:
df = pd.read_csv('quora_challenge.csv')

In [30]:
df['question_text'][3]

'How did Otto von Guericke used the Magdeburg hemispheres?'

In [31]:
df['question_text'][12]

'What is the dumbest, yet possibly true explanation for Trump being elected?'

In [21]:
# Tokenize and Clean-up using gensim’s simple_preprocess
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

# Lemmatization and remove pronouns
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

In [22]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# We will not assign dependency labels, and we will not lable named entities. And we need pos. 

nlp = spacy.load('en', disable=['parser', 'ner'])

# Define function to predict topic for a given new question.
def predict_topic(text, nlp=nlp):
    global sent_to_words
    global lemmatization

    # Step 1: Clean with simple_preprocess
    mytext_2 = list(sent_to_words(text))

    # Step 2: Lemmatize
    mytext_3 = lemmatization(mytext_2, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Step 3: Vectorize transform
    mytext_4 = tfidf_vectorizer.transform(mytext_3)

    # Step 4: LDA Transform
    topic_probability_scores = nmf.transform(mytext_4)
    topic = df_topic_keywords.iloc[np.argmax(topic_probability_scores), :].values.tolist()
    return topic, topic_probability_scores

# Predict the topic
mytext = ["Just like Larry Page and Sergey Brin unseated their incumbents with a better search engine, how likely is it that two Computer Science PhD students create a search engine that unseats Google? How vulnerable is Google to this possibility"]
topic, prob_scores = predict_topic(text = mytext)
print(topic)

['school', 'student', 'college', 'university', 'engineering', 'class', 'study', 'science', 'course', 'business', 'computer', 'friend', 'state', 'exam', 'money', 'admission', 'language', 'mark', 'rank', 'program']


In [23]:
topic, prob_scores

(['school',
  'student',
  'college',
  'university',
  'engineering',
  'class',
  'study',
  'science',
  'course',
  'business',
  'computer',
  'friend',
  'state',
  'exam',
  'money',
  'admission',
  'language',
  'mark',
  'rank',
  'program'],
 array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 1.89518728e-04, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.04084621e-05, 4.92109835e-04, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 5.56300193e-03, 3.80650193e-07]]))

In [24]:
df_topic_keywords.T['School/Student/College/University']

Word 0          school
Word 1         student
Word 2         college
Word 3      university
Word 4     engineering
Word 5           class
Word 6           study
Word 7         science
Word 8          course
Word 9        business
Word 10       computer
Word 11         friend
Word 12          state
Word 13           exam
Word 14          money
Word 15      admission
Word 16       language
Word 17           mark
Word 18           rank
Word 19        program
Name: School/Student/College/University, dtype: object