In [8]:
#import libraries

import numpy as np
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import random
import re
import string

In [9]:
#load preprocessed data

X = pd.read_feather('./X.ftr')
y = pd.read_feather('./tags.ftr')

In [10]:
#get count of all tags and select top K tags

num_tags = 100
tags_count = y.groupby('Tag')['Id'].nunique().sort_values(ascending=False).reset_index(name='count').head(num_tags)
tags_to_use = tags_count['Tag']

In [11]:
#select K questions per tag and form a set of questions over all tags

num_questions_per_tag = 1000
question_ids = set()
individual_tag_question_ids = []
for tag in tags_to_use:
    tag_question_ids = []
    count, i = 0, 0
    while count<num_questions_per_tag and i<y.shape[0]:
        if y.loc[y.index[i], 'Tag']==tag:
            tag_question_ids.append(y.loc[y.index[i], 'Id'])
            count+=1
        i+=1
    individual_tag_question_ids.append(tag_question_ids)
    question_ids.update(tag_question_ids)

#get questions having the selected tags using the above question ids set

X_with_valid_tags = X[X['Id'].isin(question_ids)]

In [14]:
#select top K words from question set per tag and form overall set of words to be used as final features

num_words_per_tag = 100
word_features = set()
for tag_question_ids in individual_tag_question_ids:
    tag_word_features = {}
    for i in range(X_with_valid_tags.shape[0]):
        if X_with_valid_tags.loc[X_with_valid_tags.index[i], 'Id'] in tag_question_ids:
            for word in nltk.tokenize.word_tokenize(X_with_valid_tags.iloc[i, 0]):
                if word not in tag_word_features:
                    tag_word_features[word] = 1
                else:
                    tag_word_features[word] += 1
            for word in nltk.tokenize.word_tokenize(X_with_valid_tags.iloc[i, 1]):
                if word not in tag_word_features:
                    tag_word_features[word] = 1
                else:
                    tag_word_features[word] += 1
    tag_word_features = sorted(tag_word_features.items(), key=lambda item: item[1], reverse=True)
    tag_word_features = [item[0] for item in tag_word_features[:num_words_per_tag]]
    word_features.update(tag_word_features)

for word in word_features:
    X_with_valid_tags[word] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [15]:
#populate the word feature columns based on whether it appears in title or body of question

for i in range(X_with_valid_tags.shape[0]):
    for word in nltk.tokenize.word_tokenize(X_with_valid_tags.iloc[i, 0]):
        if word in word_features:
            X_with_valid_tags.loc[X_with_valid_tags.index[i], word] = 1
    for word in nltk.tokenize.word_tokenize(X_with_valid_tags.iloc[i, 1]):
        if word in word_features:
            X_with_valid_tags.loc[X_with_valid_tags.index[i], word] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [16]:
#form the target labels(tags) columns

y_with_valid_tags = pd.DataFrame()
for tag in tags_to_use:
    y_with_valid_tags[tag] = 0
    
#populate target label columns based on whether the question was assigned the label

for i in range(X_with_valid_tags.shape[0]):
    for j in range(len(individual_tag_question_ids)):
        if X_with_valid_tags.loc[X_with_valid_tags.index[i], 'Id'] in individual_tag_question_ids[j]:
            y_with_valid_tags.loc[X_with_valid_tags.index[i], tags_to_use[j]] = 1
y_with_valid_tags = y_with_valid_tags.fillna(0)

In [17]:
#save the dataframes as csvs

X_with_valid_tags.to_csv('./X_with_valid_tags.csv', index=False)
y_with_valid_tags.to_csv('./y_with_valid_tags.csv', index=False)