In [1]:
import pandas as pd
import os

import nltk

In [2]:
from google.colab import drive 

In [3]:
# Mount Google Drive at the relative path 'colab' (the cell output confirms "Mounted at colab").
drive.mount('colab', force_remount=True)

Mounted at colab


In [4]:
# Switch into the project folder on the mounted Drive and print the resulting directory.
# NOTE(review): the path is relative to the current working directory, so re-running
# this cell after a successful %cd presumably no longer resolves - confirm before relying on it.
%cd "colab/MyDrive/NE-INT6940"
!pwd

/content/colab/MyDrive/NE-INT6940
/content/colab/MyDrive/NE-INT6940


In [5]:
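# One-time setup: clone the repurpost repository into the project folder.
# Left commented out because the `repurpost` directory already exists on the mounted Drive.
# !git clone https://gitlab.com/sajucrajan/repurpost.git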

In [6]:
# Confirm the working directory and show a long listing of its contents
# (the repurpost checkout is expected here).
!pwd
!ls -ltr

/content/colab/MyDrive/NE-INT6940
total 4
drwx------ 2 root root 4096 Oct 24 18:04 repurpost


In [7]:
# Load the raw questions export, decoding the file as ISO-8859-1
df_questions = pd.read_csv(
    "./repurpost/datasets/raw/Questions.csv",
    encoding="ISO-8859-1",
)
df_questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,ClosedDate,Score,Title,Body
0,80,26.0,2008-08-01T13:57:07Z,,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,58.0,2008-08-01T14:41:24Z,2012-12-26T03:45:49Z,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,83.0,2008-08-01T15:50:08Z,,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,2089740.0,2008-08-01T18:42:19Z,,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,91.0,2008-08-01T23:22:08Z,,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [8]:
# Summarize column dtypes, non-null counts and memory usage of the questions frame
df_questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1264216 entries, 0 to 1264215
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Id            1264216 non-null  int64  
 1   OwnerUserId   1249762 non-null  float64
 2   CreationDate  1264216 non-null  object 
 3   ClosedDate    55959 non-null    object 
 4   Score         1264216 non-null  int64  
 5   Title         1264216 non-null  object 
 6   Body          1264216 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 67.5+ MB


In [9]:
# Drop the columns the repurpost project does not need, leaving Id, Score, Title and Body.
# The frame is reassigned rather than mutated with inplace=True, and errors='ignore'
# is passed so that re-running this cell after the columns are gone does not raise KeyError.
df_questions = df_questions.drop(
    columns=['OwnerUserId', 'CreationDate', 'ClosedDate'],
    errors='ignore',
)
df_questions.head()

Unnamed: 0,Id,Score,Title,Body
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...


In [10]:
# Load the raw Id/Tag pairs, decoding the file as ISO-8859-1
df_tags = pd.read_csv(
    "./repurpost/datasets/raw/Tags.csv",
    encoding="ISO-8859-1",
)
df_tags.head()

Unnamed: 0,Id,Tag
0,80,flex
1,80,actionscript-3
2,80,air
3,90,svn
4,90,tortoisesvn


In [11]:
# Summarize column dtypes and memory usage of the tags frame
df_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750994 entries, 0 to 3750993
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Id      int64 
 1   Tag     object
dtypes: int64(1), object(1)
memory usage: 57.2+ MB


In [12]:
# A single question Id can carry several tags. Collapse them into one row per Id
# before joining with the questions frame; otherwise the join would repeat each
# question once per tag (a cartesian blow-up).

# Cast Tag to str so that every value can be handed to str.join
df_tags['Tag'] = df_tags['Tag'].astype(str)

# Build one pipe-delimited tag string per Id, e.g. 'flex|actionscript-3|air'
df_tags_grouped = (
    df_tags.groupby('Id')['Tag']
    .apply('|'.join)
    .reset_index()
)
df_tags_grouped.head()

Unnamed: 0,Id,Tag
0,80,flex|actionscript-3|air
1,90,svn|tortoisesvn|branch|branching-and-merging
2,120,sql|asp.net|sitemap
3,180,algorithm|language-agnostic|colors|color-space
4,260,c#|.net|scripting|compiler-construction


In [13]:
# Check the rows and columns of the df_tags_grouped dataframe
# (one row per distinct Id, since the tags were grouped by Id)
df_tags_grouped.shape

(1264216, 2)

In [14]:
# Check the rows and columns of the df_questions dataframe
# (compare the row count with df_tags_grouped before merging on Id)
df_questions.shape

(1264216, 4)

In [15]:
# Attach each question's pipe-delimited tag string with an inner join on Id
df_stackoverflow = pd.merge(df_questions, df_tags_grouped, on='Id', how='inner')
df_stackoverflow.head()



Unnamed: 0,Id,Score,Title,Body,Tag
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex|actionscript-3|air
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn|tortoisesvn|branch|branching-and-merging
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql|asp.net|sitemap
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm|language-agnostic|colors|color-space
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c#|.net|scripting|compiler-construction


In [16]:
# Release the three source frames; df_stackoverflow holds the merged data from here on
del df_questions, df_tags, df_tags_grouped

In [17]:
# Expand the pipe-delimited Tag string into a list of tags per question
df_stackoverflow['Tags'] = df_stackoverflow['Tag'].str.split('|')
df_stackoverflow.head()

Unnamed: 0,Id,Score,Title,Body,Tag,Tags
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex|actionscript-3|air,"[flex, actionscript-3, air]"
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn|tortoisesvn|branch|branching-and-merging,"[svn, tortoisesvn, branch, branching-and-merging]"
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql|asp.net|sitemap,"[sql, asp.net, sitemap]"
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm|language-agnostic|colors|color-space,"[algorithm, language-agnostic, colors, color-s..."
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c#|.net|scripting|compiler-construction,"[c#, .net, scripting, compiler-construction]"


In [18]:
# Flatten the per-question tag lists of all questions with Score > 5 into one list
# (a tag appears once per question that carries it)
tags_list = []
for question_tags in df_stackoverflow.loc[df_stackoverflow['Score'] > 5, 'Tags'].values:
  tags_list.extend(question_tags)

In [19]:
# Count tag occurrences and keep the names of the 1000 most frequent tags
tag_counts = nltk.FreqDist(tags_list)
top_1000_tags = [tag for tag, _freq in tag_counts.most_common(1000)]
top_1000_tags[:20]

['c#',
 'java',
 'javascript',
 'android',
 'python',
 'c++',
 'php',
 'jquery',
 '.net',
 'ios',
 'html',
 'css',
 'c',
 'iphone',
 'objective-c',
 'ruby-on-rails',
 'sql',
 'asp.net',
 'mysql',
 'ruby']

In [20]:
# Define function to take a list of tags as input and create a new list containing
# only the tags that are part of an allow-list (by default the top 1000 tags list)
def keep_top_tags(tags, allowed_tags=None):
  """Filter one question's tags down to those present in an allow-list.

  Args:
    tags: iterable of tag strings belonging to one question.
    allowed_tags: optional collection of tags to keep. When omitted, the
      notebook-level `top_1000_tags` list is used. Passing a set makes each
      membership test constant-time.

  Returns:
    The kept tags as a list, in their original order, or None when no tag
    is in the allow-list.
  """
  if allowed_tags is None:
    allowed_tags = top_1000_tags

  kept_tags = [tag for tag in tags if tag in allowed_tags]
  # An empty result is reported as None so the row can later be removed with dropna()
  return kept_tags if kept_tags else None

In [21]:
# Reduce every question's tag list to the top-1000 tags (None when nothing remains)
df_stackoverflow['Tags'] = df_stackoverflow['Tags'].apply(keep_top_tags)

In [22]:
# Preview the frame now that Tags has been filtered through keep_top_tags
df_stackoverflow.head()

Unnamed: 0,Id,Score,Title,Body,Tag,Tags
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,flex|actionscript-3|air,"[flex, actionscript-3]"
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,svn|tortoisesvn|branch|branching-and-merging,"[svn, tortoisesvn, branch]"
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,sql|asp.net|sitemap,"[sql, asp.net]"
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,algorithm|language-agnostic|colors|color-space,"[algorithm, language-agnostic, colors]"
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,c#|.net|scripting|compiler-construction,"[c#, .net, scripting, compiler-construction]"


In [23]:
# Keep questions scoring above 5 and discard rows with any missing value
# (this removes rows whose Tags became None in the filtering step)
df_stackoverflow = df_stackoverflow.loc[df_stackoverflow['Score'] > 5].dropna()

In [24]:
# Drop the pipe-delimited Tag column now that the Tags list column replaces it.
# The frame is reassigned rather than mutated with inplace=True, and errors='ignore'
# is passed so that re-running this cell after the column is gone does not raise KeyError.
df_stackoverflow = df_stackoverflow.drop(columns=['Tag'], errors='ignore')
df_stackoverflow.head()

Unnamed: 0,Id,Score,Title,Body,Tags
0,80,26,SQLStatement.execute() - multiple queries in o...,<p>I've written a database generation script i...,"[flex, actionscript-3]"
1,90,144,Good branching and merging tutorials for Torto...,<p>Are there any really good tutorials explain...,"[svn, tortoisesvn, branch]"
2,120,21,ASP.NET Site Maps,<p>Has anyone got experience creating <strong>...,"[sql, asp.net]"
3,180,53,Function for creating color wheels,<p>This is something I've pseudo-solved many t...,"[algorithm, language-agnostic, colors]"
4,260,49,Adding scripting functionality to .NET applica...,<p>I have a little game written in C#. It uses...,"[c#, .net, scripting, compiler-construction]"


In [25]:
# Rows and columns remaining after the score filter, dropna and removal of the Tag column
df_stackoverflow.shape

(71237, 5)


Clean up Data

In [26]:
import nltk
# Download the NLTK 'stopwords' package, which stopwords.words('english') reads in a later cell
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
from bs4 import BeautifulSoup
import re

from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from datetime import datetime

# English stop words as a set, used by clean_stopwords for membership tests
stop_words=set(stopwords.words('english'))
# Tokenizer shared by clean_stopwords and clean_punctuations
tokenizer = ToktokTokenizer()
# Characters removed by the punctuation regex. Note that '-' is not in this list,
# so hyphens (as in tags such as 'actionscript-3') are left in place.
punctuations = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'

In [1]:
# Number of texts processed so far; advanced by clean_punctuations
count = 0
# Character class matching any single character listed in `punctuations`
regex = re.compile('[' + re.escape(punctuations) + ']')

def clean_stopwords(text):
  """Tokenize `text`, drop every token found in `stop_words`, and re-join with spaces.

  The membership test is an exact string comparison, so a token is removed only
  if `stop_words` contains it with the same casing.
  """
  kept_tokens = [token for token in tokenizer.tokenize(text) if token not in stop_words]
  return " ".join(kept_tokens)

def clean_all_punctuations(text):
  """Return `text` with every character matched by the notebook-level `regex` removed."""
  stripped = regex.sub('', text)
  return stripped

def _protected_tag_set():
  """Return the entries of the notebook-level `tags_list` as a frozenset.

  The set is cached on the function object and rebuilt only when `tags_list`
  is rebound or changes length, so the list is not re-hashed for every text.
  """
  cache = _protected_tag_set.__dict__
  if cache.get('source') is not tags_list or cache.get('size') != len(tags_list):
    cache['source'] = tags_list
    cache['size'] = len(tags_list)
    cache['tags'] = frozenset(tags_list)
  return cache['tags']


def clean_punctuations(text):
  """Strip punctuation from `text`, leaving tokens that are known tags untouched.

  The text is tokenized with the notebook-level `tokenizer`. A token that
  appears in `tags_list` (e.g. 'c#', '.net') is kept verbatim; every other
  token has the characters matched by `regex` removed. A token made up only
  of punctuation becomes an empty string and is still joined into the result.

  Side effects: increments the global `count` and prints it every 500 calls
  as a progress indicator.

  Args:
    text: the string to clean.

  Returns:
    The processed tokens joined by single spaces.
  """
  global count
  protected_tags = _protected_tag_set()

  # Each token is emitted exactly once: verbatim if it is a tag, stripped otherwise
  clean_text = [
      word if word in protected_tags else regex.sub('', word)
      for word in tokenizer.tokenize(text)
  ]

  count = count + 1
  if count % 500 == 0:
    print(count)
  return " ".join(clean_text)

NameError: ignored

In [29]:
# Smoke-test the two cleaners on sample sentences and display the cleaned text,
# so the result can actually be inspected.
# Note: every clean_punctuations call also advances the global progress counter `count`.
test = ['This is a test?',
        'This is really not any test that I want to take today!']

# Extra manual check with mixed punctuation (uncomment to run):
# print(clean_punctuations(clean_stopwords("I ?\]s34a-2=32+ written database generation script SQL want ! ? a-@")))

[clean_punctuations(clean_stopwords(t)) for t in test]

In [30]:
# Remove the HTML tags from the 'Body' column.
# The parser is named explicitly: when it is omitted, BeautifulSoup picks whichever
# parser is installed (and emits a warning), so the extracted text could differ
# between environments. 'html.parser' ships with Python and needs no extra package.
df_stackoverflow['Body'] = df_stackoverflow['Body'].apply(
    lambda text: BeautifulSoup(text, 'html.parser').get_text())

In [31]:
# Strip English stop words from the Title and Body columns, overwriting each column
for column in ('Title', 'Body'):
  df_stackoverflow[column] = df_stackoverflow[column].apply(clean_stopwords)

In [32]:
# Write punctuation-stripped copies of Title and Body into new '-Clean' columns;
# clean_punctuations leaves tokens that are part of the tags list untouched
for source_column, clean_column in (('Title', 'Title-Clean'), ('Body', 'Body-Clean')):
  df_stackoverflow[clean_column] = df_stackoverflow[source_column].apply(clean_punctuations)