# HW5 Skeleton Code
Please note that this skeleton code is provided to help you with homework.
Full description of each question can be found on HW5.pdf, so please read instruction of each question carefully. There might be some questions that is not presented in this code.

In [1]:
import os
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

## Q. Changing HTML Text to Plain Text

The Python library <b>BeautifulSoup</b> is useful for dealing with html text. In order to use this library, you will need to install it first by running the following command: 
 <b>conda install beautifulsoup4</b> 
 in the terminal.
 <br> In the code, you can import it by running the following line: 
<br> 
  <b>from bs4 import BeautifulSoup </b>
<br>

In [2]:
  #Read our data file
df_train = pd.read_csv(r'stack_stats_2023_train.csv') #Todo
df_test = pd.read_csv(r'stack_stats_2023_test.csv') #Todo

df_train['Tags']

0        <machine-learning><reinforcement-learning><q-l...
1                      <probability><law-of-large-numbers>
2                                       <cross-validation>
3                        <unbalanced-classes><calibration>
4        <multiple-regression><missing-data><likert><it...
                               ...                        
19242    <machine-learning><data-imputation><recommende...
19243    <regression><modeling><measurement-error><erro...
19244                               <poisson-distribution>
19245    <machine-learning><mathematical-statistics><cu...
19246                                  <r><bioinformatics>
Name: Tags, Length: 19247, dtype: object

In [6]:
#Cleaning 'Body'
#Change HTML Text to Plain text using get_text() function from BeautifulSoup
#If you are not familiar with the apply method, please check discussion week 10 lecture and code.

#df_train['Body'] = df_train['Body'].apply(lambda x: soup.get_text(x))
#Manually cleaned up newline tag \n and tab tag \t.
df_train['Body'] = df_train['Body'].apply(lambda x: BeautifulSoup(x).get_text().replace('/n', '')) #Todo
#If you need any other cleaning process, please uncomment the below.
#df_train['Body'] = df_train['Body'].apply(lambda ) #Todo

#Cleaning Tags
#This would be somewhat similar to the above.
#df_train['Tags'] = df_train['Tags'].apply(lambda x: soup.get_text(x)) #Todo
#Manually cleaned up newline tag \n and tab tag \t.
df_train['Tags'] = df_train['Tags'].apply(lambda x: BeautifulSoup(x).get_text().replace('/n', ''))


#Todo: Repeat the same process for test dataset 
df_test['Body'] = df_test['Body'].apply(lambda x: BeautifulSoup(x).get_text().replace('/n', ''))
df_test['Tags'] = df_test['Tags'].apply(lambda x: BeautifulSoup(x).get_text().replace('/n', ''))


df_train['Tags']

0         
1         
2         
3         
4         
        ..
19242     
19243     
19244     
19245     
19246     
Name: Tags, Length: 19247, dtype: object

## Q. Basic Text Cleaning and Merging into a single Text data

### Change to Lower Case, Remove puncuation, digits, 

In [4]:
#Change to Lowercase

df_train[['Body','Title','Tags']] = df_train[['Body','Title','Tags']].applymap(lambda x: x.lower()) #Todo, do you see why we used applymap instead of apply in this case? 
df_test[['Body','Title','Tags']] = df_test[['Body','Title','Tags']].applymap(lambda x: x.lower()) #Todo

df_train['Tags']

0         
1         
2         
3         
4         
        ..
19242     
19243     
19244     
19245     
19246     
Name: Tags, Length: 19247, dtype: object

In [5]:
#Remove Punctations 
from string import punctuation

#You can get this function from our discussion session code. However, we leave it as a blank for a practice.
def remove_punctuation(document):
    
    no_punct = ''.join([character for character in document if character not in punctuation])#Todo

    return no_punct

df_train['Body'] = df_train['Body'].apply(remove_punctuation)
df_train['Title'] = df_train['Title'].apply(remove_punctuation)
df_train['Tags'] = df_train['Tags'].apply(remove_punctuation)
df_test['Body'] = df_test['Body'].apply(remove_punctuation)
df_test['Title'] = df_test['Title'].apply(remove_punctuation)
df_test['Tags'] = df_test['Tags'].apply(remove_punctuation)
#df_train[['Body','Title','Tags']] = df_train[['Body','Title','Tags']].apply(remove_punctuation)#Todo 
#df_test[['Body','Title','Tags']] = df_test[['Body','Title','Tags']].apply(remove_punctuation)#Todo

df_train['Tags']

0         
1         
2         
3         
4         
        ..
19242     
19243     
19244     
19245     
19246     
Name: Tags, Length: 19247, dtype: object

In [6]:
#Remove Digits 

def remove_digit(document): 
    
    no_digit = ''.join([character for character in document if not character.isdigit()])#Todo
              
    return no_digit

df_train['Body'] = df_train['Body'].apply(remove_digit)
df_train['Title'] = df_train['Title'].apply(remove_digit)
df_train['Tags'] = df_train['Tags'].apply(remove_digit)
df_test['Body'] = df_test['Body'].apply(remove_digit)
df_test['Title'] = df_test['Title'].apply(remove_digit)
df_test['Tags'] = df_test['Tags'].apply(remove_digit)
#df_train[['Body','Title','Tags']] = df_train[['Body','Title','Tags']].apply(remove_digit)#Todo 
#df_test[['Body','Title','Tags']] = df_test[['Body','Title','Tags']].apply(remove_digit)#Todo

df_train['Tags']

0         
1         
2         
3         
4         
        ..
19242     
19243     
19244     
19245     
19246     
Name: Tags, Length: 19247, dtype: object

### Tokenization and Remove Stopwords and do stemming

In [19]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

df_train['Body'] = df_train['Body'].apply(word_tokenize)
df_train['Title'] = df_train['Title'].apply(word_tokenize)
df_train['Tags'] = df_train['Tags'].apply(word_tokenize)
df_test['Body'] = df_test['Body'].apply(word_tokenize)
df_test['Title'] = df_test['Title'].apply(word_tokenize)
df_test['Tags'] = df_test['Tags'].apply(word_tokenize)
#df_train[['Body','Title','Tags']] = df_train[['Body','Title','Tags']].apply(word_tokenize)#Todo 
#df_test[['Body','Title','Tags']] = df_test[['Body','Title','Tags']].apply(word_tokenize)#Todo

df_train['Tags']

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0        []
1        []
2        []
3        []
4        []
         ..
19242    []
19243    []
19244    []
19245    []
19246    []
Name: Tags, Length: 19247, dtype: object

In [20]:
#Remove Stopwords

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(document):
    
    words = [word for word in document if not word in stop_words]#Todo
    
    return words

df_train['Body'] = df_train['Body'].apply(remove_stopwords)
df_train['Title'] = df_train['Title'].apply(remove_stopwords)
df_train['Tags'] = df_train['Tags'].apply(remove_stopwords)
df_test['Body'] = df_test['Body'].apply(remove_stopwords)
df_test['Title'] = df_test['Title'].apply(remove_stopwords)
df_test['Tags'] = df_test['Tags'].apply(remove_stopwords)
#df_train[['Body','Title','Tags']] = df_train[['Body','Title','Tags']].apply(lambda s: s.remove_stopwords() if type(s)==str else s)#Todo
#df_test[['Body','Title','Tags']] = df_test[['Body','Title','Tags']].apply(lambda s: s.remove_stopwords() if type(s)==str else s)#Todo

df_train['Tags']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0        []
1        []
2        []
3        []
4        []
         ..
19242    []
19243    []
19244    []
19245    []
19246    []
Name: Tags, Length: 19247, dtype: object

In [21]:
#We use porter stemming 

from nltk.stem import PorterStemmer

porter = PorterStemmer()

def stemmer(document):
    
    stemmed_document = [porter.stem(word) for word in document] #Todo
    
    return stemmed_document

df_train['Body'] = df_train['Body'].apply(stemmer)
df_train['Title'] = df_train['Title'].apply(stemmer)
df_train['Tags'] = df_train['Tags'].apply(stemmer)
df_test['Body'] = df_test['Body'].apply(stemmer)
df_test['Title'] = df_test['Title'].apply(stemmer)
df_test['Tags'] = df_test['Tags'].apply(stemmer)
#df_train[['Body','Title','Tags']] = df_train[['Body','Title','Tags']].apply(lambda s: s.stemmer() if type(s)==str else s) #Todo
#df_test[['Body','Title','Tags']] = df_test[['Body','Title','Tags']].apply(lambda s: s.stemmer() if type(s)==str else s) #Todo

df_train['Tags']

0        []
1        []
2        []
3        []
4        []
         ..
19242    []
19243    []
19244    []
19245    []
19246    []
Name: Tags, Length: 19247, dtype: object

## Let's Check our dataframe

In [10]:
df_train.head(5)

Unnamed: 0,Id,Score,Body,Title,Tags
0,502641,1,"[im, master, student, eec, work, way, toward, ...","[pytorch, tutori, dqn, defin, state, differ]",[]
1,477291,1,"[know, good, question, found, answer, anywher,...","[random, walk, memori]",[]
2,448489,4,"[time, repeat, fold, crossvalid, want, report,...","[statist, report, repeat, crossvalid]",[]
3,487075,0,"[dataset, mm, record, around, featur, class, i...","[binari, classif, imbalanc, data, odd, calibr,...",[]
4,481670,2,"[want, run, regress, one, explanatori, variabl...","[best, summar, likert, data, use, independ, va...",[]


### Q. Treat Three text data independently and merge into one column

In [11]:
#Treat Three types of data independently
#let's define functions that will help this operation

def add_body(document):
    
    added_document = document.add('Body') #Todo
    
    return added_document

def add_title(document):
    
    added_document = document.add('Title') #Todo
    
    return added_document

def add_tags(document):
    
    added_document = document.add('Tags') #Todo
    
    return added_document

In [12]:
#df_train['Body'] = df_train['Body'].apply(lambda s: s.add_body() if type(s)==str else s)
#df_train['Title'] = df_train['Title'].apply(lambda s: s.add_title() if type(s)==str else s)
#df_train['Tags'] = df_train['Tags'].apply(lambda s: s.add_tags() if type(s)==str else s)

#df_test['Body'] = df_test['Body'].apply(lambda s: s.add_body() if type(s)==str else s)
#df_test['Title'] = df_test['Title'].apply(lambda s: s.add_title() if type(s)==str else s)
#df_test['Tags'] = df_test['Tags'].apply(lambda s: s.add_tags() if type(s)==str else s)

df_train['Body'] = df_train['Body'].apply(add_body)
df_train['Title'] = df_train['Title'].apply(add_title)
df_train['Tags'] = df_train['Tags'].apply(add_tags)

df_test['Body'] = df_test['Body'].apply(add_body)
df_test['Title'] = df_test['Title'].apply(add_title)
df_test['Tags'] = df_test['Tags'].apply(add_tags)

AttributeError: 'list' object has no attribute 'add'

In [23]:
#Now we need to merge all those 3 columns into a single column. Implement this below.
#df_train['text'] = df_train[['Body','Title','Tags']].apply(lambda x: pd.DataFrame.join()) #Todo
train_files = [train_Body, train_Title, train_Tags]
test_files = [test_Body, test_Title, test_Tags]
df_train = pd.concat(train_files, axis = 1, join = 'inner')
df_test = pd.concat(test_files, axis = 1, join = 'inner') #Todo

## Let's check our DataFrame

In [None]:
df_train.head(5)

### Q. Detokenize and convert to document term matrices

In [None]:
#Merge Three text column into one column and detokenize

from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer

text_train = df_train['text'].apply() #Todo: Detokenize your tokenized text data
countvec_train = #Todo: Define your own CountVectorizer here
sparse_dtm_train = #Todo: Fit and Transform your Countvectorizer and return sparse dtm.

In [None]:
#Todo: Do same on the test set.
text_test = df_test['text'].apply()
sparse_dtm_test = 

In [None]:
#Convert the sprase dtm to pandas DataFrame.
dtm_train = #Todo
dtm_test = #Todo

### Q. Change dependent variable to binary variable

In [None]:
#Change 'Score' to a binary variable, which indicates whether the question is good or not.
y_train = #Todo
y_test = #Todo

In [None]:
#Add y_train and y_test to your data frame if it is needed. Drop unnecessary columns
df_train[''] = y_train
df_test[''] = y_test
df_train.drop(columns = [], inplace = True)
df_test.drop(columns = [], inplace = True)

## Let's check our DataFrame


In [None]:
df_train.head(5)

## (b) Please read the instruction carefully in the pdf.

In [None]:
#Create Comparison Table
#These lines are provided for you to help construct a comparison table.
#It is not requred to follow this format. + You need to find ACC, TPR, FPR, PRE for each model that you choose.
comparison_data = {'Baseline':[baseline_acc,baseline_TPR,baseline_FPR, baseline_PRE],
                   'Logistic Regression':[log_acc,log_TPR,log_FPR, log_PRE],
                   'Decision Tree Classifier':[dtc_acc,dtc_TPR,dtc_FPR,dtc_PRE],
                   'Random Forest with CV':[rf_acc,rf_TPR, rf_FPR,rf_PRE],
                  'Linear Discriminant Analysis':[lda_acc,lda_TPR, lda_FPR,lda_PRE]}

comparison_table = pd.DataFrame(data=comparison_data, index=['Accuracy', 'TPR', 'FPR','PRE']).transpose()
comparison_table.style.set_properties(**{'font-size': '12pt',}).set_table_styles([{'selector': 'th', 'props': [('font-size', '10pt')]}])
comparison_table


## Report details of your training procedures and final comparisons on the test set in this cell. Use your best judgment to choose a final model and explain your choice.

## Report Bootstrap Analysis in this cell

### (c)