In [2]:
import numpy as np

import pandas as pd

In [6]:
# Load the Quora questions corpus. The path is relative, so the CSV must sit
# in the notebook's working directory.
data = pd.read_csv("quora_questions.csv")

In [7]:
# Preview the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [8]:
# (rows, columns) of the loaded frame.
data.shape

(404289, 1)

In [9]:
# Show the full text of the question in row 0 (the head() preview truncates it).
# A single .loc[row_label, column] lookup replaces the chained indexing.
data.loc[0, "Question"]

'What is the step by step guide to invest in share market in india?'

### Creating the Vectorized Document-Term Matrix

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
# Configure the TF-IDF vectorizer (semantics per the scikit-learn docs):
#   stop_words="english" -> drop scikit-learn's built-in English stop words
#   min_df=2             -> ignore terms that appear in fewer than 2 documents
#   max_df=0.95          -> ignore terms that appear in more than 95% of documents
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

In [12]:
# Learn the vocabulary and IDF weights from the questions and build the
# document-term matrix in one step: one row per question, one column per
# retained term.
dtm = tfidf.fit_transform(data["Question"])

In [13]:
# Display the sparse matrix summary (shape, dtype, number of stored values).
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

### Non-Negative Matrix Factorization

In [14]:
from sklearn.decomposition import NMF

In [15]:
# Factorize into 20 components (topics). random_state is fixed so the
# initialization, and therefore the resulting topics, are repeatable.
nmf = NMF(
    n_components=20,
    random_state=42,
)

In [16]:
# Fit the NMF model on the TF-IDF document-term matrix. fit() returns the
# estimator itself, which is why the cell output is the model's repr.
nmf.fit(dtm)



NMF(n_components=20, random_state=42)

### Printing the 15 highest-weighted words for each of the 20 topics

In [18]:
# Resolve the vocabulary ONCE, outside the loop. The original called
# tfidf.get_feature_names() inside the comprehension, rebuilding the whole
# vocabulary for every single word printed. That method was also deprecated
# in scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is its
# replacement. The fallback keeps the cell working on pre-1.0 installs.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Each row of nmf.components_ holds one topic's weight for every vocabulary term.
for i, topic in enumerate(nmf.components_):
    print(f"THE TOP 15 WORD FOR TOPIC {i} ARE : ")

    # argsort() is ascending, so the last 15 indices are the 15 largest
    # weights; the strongest word is printed last.
    print([feature_names[index] for index in topic.argsort()[-15:]])

    print("\n")

THE TOP 15 WORD FOR TOPIC 0 ARE : 




['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


THE TOP 15 WORD FOR TOPIC 1 ARE : 
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


THE TOP 15 WORD FOR TOPIC 2 ARE : 
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


THE TOP 15 WORD FOR TOPIC 3 ARE : 
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


THE TOP 15 WORD FOR TOPIC 4 ARE : 
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


THE TOP 15 WORD FOR TOPIC 5 ARE : 
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 'olym

### Assigning Topics

In [19]:
# Project every question onto the fitted topics. Per the scikit-learn docs,
# transform() returns an array of shape (n_samples, n_components), i.e. one
# row per question and one weight per topic here.
topic_results = nmf.transform(dtm)

In [20]:
# Label each question with its dominant topic: the column index of the
# largest weight in that question's row of topic_results.
data["Topic"] = np.argmax(topic_results, axis=1)

In [21]:
# Preview the frame again to check the Topic column.
data.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14


In [22]:
# Show every question whose dominant topic is 16 (pandas truncates the display).
data.loc[data["Topic"].eq(16)]

Unnamed: 0,Question,Topic
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
56,Who is israil friend?,16
107,What's the difference between love and pity?,16
181,How can I stop being addicted to love?,16
243,What are the signs of an ultra smart person pl...,16
...,...,...
404173,Is Kapil Sharma married?,16
404183,I am an introvert and couldnt guess the right ...,16
404190,Do you have an imaginary friend? What is your ...,16
404202,Which professors from IIT Guwahati would you l...,16
