In [30]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

In [16]:
Questions = pd.read_csv('pythonquestions/Questions.csv', encoding="ISO-8859-1")
Answers = pd.read_csv('pythonquestions/Answers.csv', encoding="ISO-8859-1")
Tags = pd.read_csv('pythonquestions/Tags.csv', encoding="ISO-8859-1")

Questions.columns = ['QID', 'QuestionUserId', 'QuestionCreateDate', 'QuestionScore', 'QuestionTitle', 'QuestionBody']
Answers.columns = ['AID','AnswerUserId', 'AnswerCreateDate', 'ParentId', 'AnswerScore', 'AnswerBody']
Tags.columns = ['TID', 'Tag']

In [17]:
print(Questions.shape)
Questions.head()

(607282, 6)


Unnamed: 0,QID,QuestionUserId,QuestionCreateDate,QuestionScore,QuestionTitle,QuestionBody
0,469,147.0,2008-08-02T15:11:16Z,21,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...
1,502,147.0,2008-08-02T17:01:58Z,27,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...
2,535,154.0,2008-08-02T18:43:54Z,40,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...
3,594,116.0,2008-08-03T01:15:08Z,25,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...
4,683,199.0,2008-08-03T13:19:16Z,28,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...


In [18]:
print(Answers.shape)
Answers.head()

(987122, 6)


Unnamed: 0,AID,AnswerUserId,AnswerCreateDate,ParentId,AnswerScore,AnswerBody
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B..."


In [19]:
print(Tags.shape)
Tags.head()

(1885078, 2)


Unnamed: 0,TID,Tag
0,469,python
1,469,osx
2,469,fonts
3,469,photoshop
4,502,python


In [20]:
print(Questions.isnull().sum())
print(Answers.isnull().sum())
print(Tags.isnull().sum())

QID                      0
QuestionUserId        6212
QuestionCreateDate       0
QuestionScore            0
QuestionTitle            0
QuestionBody             0
dtype: int64
AID                    0
AnswerUserId        5367
AnswerCreateDate       0
ParentId               0
AnswerScore            0
AnswerBody             0
dtype: int64
TID      0
Tag    443
dtype: int64


Looks like this data is pretty clean. The only nulls are for users (probably users that have deleted their accounts since posting), and tags (since these are optional).

### Preprocess text fields

In [21]:
# Combine title and body
Questions['QuestionTitleAndBody'] = Questions['QuestionTitle'] + " " + Questions['QuestionBody']

In [22]:
nonCodeTagsRegex = re.compile("<\/?(?!\/?code).*?>")

In [35]:
lem = WordNetLemmatizer()
def lemmatize_text(text):
    text = nonCodeTagsRegex.sub("", text).lower()
    text = re.sub(r'[^\w\s]','',text)
    return [lem.lemmatize(w) for w in word_tokenize(text)]

In [None]:
Questions['QuestionTitleAndBodyLemmatized'] = Questions.QuestionTitleAndBody.apply(lemmatize_text)
Answers['AnswerBodyLemmatized'] = Answers.AnswerBody.apply(lemmatize_text)

In [None]:
print(Questions['QuestionTitleAndBodyLemmatized'][2])
Questions.head()

In [None]:
print(Answers['AnswerBodyLemmatized'][2])
Answers.head()

In [None]:
Questions.to_csv('LemmatizedQuestions.csv', index=False)
Answers.to_csv('LemmatizedAnswers.csv', index=False)

In [44]:
Answers.head()

Unnamed: 0,AID,AnswerUserId,AnswerCreateDate,ParentId,AnswerScore,AnswerBody,AnswerBodyLemmatized
0,497,50.0,2008-08-02T16:56:53Z,469,4,<p>open up a terminal (Applications-&gt;Utilit...,"[open, up, a, terminal, applicationsgtutilitie..."
1,518,153.0,2008-08-02T17:42:28Z,469,2,<p>I haven't been able to find anything that d...,"[i, havent, been, able, to, find, anything, th..."
2,536,161.0,2008-08-02T18:49:07Z,502,9,<p>You can use ImageMagick's convert utility f...,"[you, can, use, imagemagicks, convert, utility..."
3,538,156.0,2008-08-02T18:56:56Z,535,23,<p>One possibility is Hudson. It's written in...,"[one, possibility, is, hudson, it, written, in..."
4,541,157.0,2008-08-02T19:06:40Z,535,20,"<p>We run <a href=""http://buildbot.net/trac"">B...","[we, run, buildbot, trac, at, work, i, havent,..."
