In [59]:
import pandas as pd
import lxml
from bs4 import BeautifulSoup

# Read CSV files to get questions and tags
df_questions = pd.read_csv("Questions.csv", encoding="ISO-8859-1")

# keep_default_na=False: tag names are free text, so a tag spelled like one of
# pandas' default NA markers ("null", "nan", "NA", ...) would otherwise be
# parsed as a missing value and lose its real name.
# NOTE(review): this disables NA parsing for every column of Tags.csv; it
# assumes the file has no genuinely empty cells in numeric columns - confirm.
df_tags = pd.read_csv(
    "Tags.csv",
    encoding="ISO-8859-1",
    dtype={'Tag': str},
    keep_default_na=False,
)

In [60]:
# Collapse the one-row-per-tag table into a single space-separated tag string
# per question Id.
df_tags['Tag'] = df_tags['Tag'].astype(str)  # str.join rejects non-str values
grouped_tags = df_tags.groupby("Id")['Tag'].apply(' '.join)

# reset_index returns a new frame rather than modifying grouped_tags, so the
# result has to be bound; name= labels the value column 'Tags'.
grouped_tags_final = grouped_tags.reset_index(name='Tags')

#df_questions.drop(columns=['OwnerUserId', 'CreationDate', 'ClosedDate'], inplace=True)

# Attach the tag string to each question (default inner join on 'Id').
df = df_questions.merge(grouped_tags_final, on='Id')

# Fixed random_state keeps the sample reproducible across runs.
df_sample = df.sample(n=100000,random_state = 1)
#df_sample = df.sample(n=100,random_state = 1)

df_sample.head(5)

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body,Tags
345874,12395950,795000.0,2012-09-12T20:45:35Z,2016-10-05T20:53:44Z,2,Calling Twain driver from 64 bit application,<p>I need to call TWAIN API from .NET (C#) app...,c# .net twain
654403,22426350,1378959.0,2014-03-15T16:17:17Z,,0,Why does variable in struct instance change af...,<p>This is really weird.</p>\n\n<p>I have a st...,c struct
574386,19842160,2429569.0,2013-11-07T17:07:34Z,,0,dynamically creating text fields lands up in d...,"<p>I'm having issues with the below code,\nwhe...",javascript jquery html
671187,22953000,3513576.0,2014-04-09T04:48:04Z,,0,Javascript in PHP/HTML hiding Div not working,<p>I am new to coding so be gentle. I am buil...,javascript php html
1088725,35528560,1712334.0,2016-02-20T20:24:29Z,,0,Design a compareTo that can sort in ascending ...,<p>I have a class States that is an object tha...,java


In [61]:
# Strip HTML markup, remove stop words and punctuation, then lemmatize the words
#import nltk
#nltk.download()
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
import re


# Shared NLTK helpers used by the cleaning functions.
stop_words = set(stopwords.words("english"))
lemma = WordNetLemmatizer()

# Keep the tags that appear at least 100 times.
tag_count = (
    df_tags['Tag']
    .value_counts(ascending=False)
    .rename_axis('Tag')
    .reset_index(name='count')
)
most_common_tag = tag_count.loc[tag_count['count'] >= 100, 'Tag'].tolist()

def clean_html(x):
    """Parse ``x`` with BeautifulSoup's 'lxml' parser and return its text content."""
    soup = BeautifulSoup(x, 'lxml')
    return soup.get_text()

def clean_stopwords_punctuations(x):
    """Drop stop words from ``x`` and strip punctuation from the remaining tokens.

    ``x`` is tokenized with ``word_tokenize``. Tokens found in ``stop_words``
    are discarded. A surviving token that exactly matches an entry of
    ``most_common_tag`` is kept verbatim, punctuation included; every other
    token has the characters listed in ``punct`` removed ('#', '+' and '-'
    are not in that list, so they are never stripped).

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The processed tokens joined by single spaces. A token made up only of
        stripped characters contributes an empty string, so the result may
        contain consecutive spaces.
    """
    punct = '!"$%&\'()*,./:;<=>?@[\\]^_`{|}~'
    strip_punct = re.compile('[%s]' % re.escape(punct))

    # Build a set once per call so each token lookup is a hash probe instead
    # of a linear scan over the whole tag list.
    common_tags = set(most_common_tag)

    kept = []
    for token in word_tokenize(x):
        if token in stop_words:
            continue
        # keep the punctuation in most common tags
        kept.append(token if token in common_tags else strip_punct.sub('', token))

    return ' '.join(kept)
 
    
def lemma_word(x):
    """Tokenize ``x``, lowercase each token and reduce it to its verb lemma.

    Parameters
    ----------
    x : str
        Text to lemmatize.

    Returns
    -------
    str
        The lowercased, lemmatized tokens joined by single spaces.
    """
    # Lowercase *before* lemmatizing: the WordNet lookup is case-sensitive, so
    # a capitalised token (e.g. at the start of a sentence) would otherwise be
    # returned unchanged and merely lowercased afterwards.
    return ' '.join(
        lemma.lemmatize(token.lower(), pos="v") for token in word_tokenize(x)
    )


# Run the full cleaning pipeline (HTML -> stop words/punctuation -> lemmas) on
# each text column and write the result back to the column it came from.
# The steps are chained so that every stage consumes the previous stage's
# output, and the Title result goes to 'Title' instead of overwriting 'Body'.
#
# The input is read from `df` (df_sample keeps df's index labels), so
# re-running this cell always starts from the untouched text rather than from
# whatever an earlier run left in df_sample.
for column in ('Body', 'Title'):
    df_sample[column] = (
        df.loc[df_sample.index, column]
        .apply(clean_html)
        .apply(clean_stopwords_punctuations)
        .apply(lemma_word)
    )

df_sample.to_csv('data_no_rake.csv', index = None)

# Last expression of the cell so the preview is actually rendered.
df_sample.head(5)

In [70]:
##test: split train and test dataset
from sklearn.model_selection import train_test_split

# Reload the cleaned sample from disk; features are the two text columns and
# the target is the space-separated tag string.
test = pd.read_csv("data_no_rake.csv")
X, y = test[['Title','Body']], test['Tags']

# 10% of the rows go to the test split; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)

In [68]:
#use rake to preprocess
from rake_nltk import Rake

# One shared extractor instance, reused for every row.
r = Rake()


def rake_process(x):
    """Extract RAKE key phrases from ``x`` and return them as one string.

    The phrases come back from ``get_ranked_phrases()`` and are joined, in
    that order, with single spaces.
    """
    r.extract_keywords_from_text(x)
    return ' '.join(str(phrase) for phrase in r.get_ranked_phrases())

# Strip HTML and extract RAKE key phrases for each text column, writing the
# result back to the same column ('Title' is derived from the title text, not
# from 'Body').
#
# The input is read from `df` (df_sample keeps df's index labels) so that the
# extractor sees the original text, not text an earlier cell already stripped
# of stop words and punctuation, and so that re-running this cell is safe.
for column in ('Body', 'Title'):
    df_sample[column] = (
        df.loc[df_sample.index, column]
        .apply(clean_html)
        .apply(rake_process)
    )

df_sample.to_csv('data_rake.csv', index = None)

# Last expression of the cell so the preview is actually rendered.
df_sample.head(5)

In [None]:
##test
# Reload the RAKE-processed sample from disk; features are the two text
# columns and the target is the space-separated tag string.
test = pd.read_csv("data_rake.csv")
X, y = test[['Title','Body']], test['Tags']

# 10% of the rows go to the test split; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
)