In [1]:
import json
import pandas as pd
import nltk
import string
import re
import csv
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer, LancasterStemmer
nltk.data.path.append("/Users/youqiao/workspace/env/nltk_data")
%matplotlib inline

In [2]:
questions = pd.read_csv('Questions.csv',encoding='latin1')
questions["content"] = questions["Title"] + " " + questions["Body"]
tags = pd.read_csv('Tags.csv', encoding='latin1')
stop = stopwords.words('english')

In [27]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def cleantags(text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', text)
    return cleantext

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def lemmatize_tokens(tokens, lemmatizer):
    lemmatized = []
    for item in tokens:
        lemmatized.append(lemmatizer.lemmatize(item, pos='v'))
    return lemmatized

def extractwords(tokens):
    new_tokens = []
    for word, pos in nltk.pos_tag(tokens):
        if pos[:2] == 'NN' or pos[:2] == "JJ":
            new_tokens.append(word)
    return new_tokens

def excludestopwords(tokens):
    new_tokens = []
    for word in tokens:
        if word not in stopwords:
            new_tokens.append(word)
    return new_tokens

def tokenize_language(text):
    text = cleantags(text)
    translator = str.maketrans('', '', string.punctuation)
    no_punctuation = text.translate(translator)
    tokens = nltk.word_tokenize(no_punctuation)
    return tokens

def tokenize(text):
    text = cleantags(text)
    translator = str.maketrans('', '', string.punctuation)
    no_punctuation = text.translate(translator)
    tokens = nltk.word_tokenize(no_punctuation)
    lemmatizes = lemmatize_tokens(tokens, lemmatizer)
    return lemmatizes

def tokenizewithnostem(text):
    text = cleantags(text)
    translator = str.maketrans('', '', string.punctuation)
    no_punctuation = text.translate(translator)
    tokens = nltk.word_tokenize(no_punctuation)
    lemmatizes = lemmatize_tokens(tokens, lemmatizer)
    return lemmatizes

def genitemset(row, wordbags):
    itemsets = []
    for word in wordbags:
        if row[word] == 1:
            itemsets.append(word)
    return ",".join(itemsets)


class AssociationRule():
    def __init__(self, wordbag=None):
        self.wordbag = [] if wordbag is None else wordbag
        self.df = questions
        self.df_tfidf = None
        self.vector = None

    def preprocess(self, tag):
        """
        :param tag: str
        :return:
        """
        ids_tag = tags[tags["Tag"] == tag]["Id"]
        self.df = pd.DataFrame(ids_tag)
        self.df = self.df.set_index('Id').join(questions.set_index('Id'))
        print(self.df.shape)

    def tokenizing(self, ngram=1, min_df=0.1, max_df=0.9, tokenizef=None, voc=None):
        tfidfvector = TfidfVectorizer(tokenizer=tokenizef, ngram_range=(1, ngram), min_df=min_df, max_df=max_df, stop_words=stop, vocabulary=voc)
        text_tfidf = tfidfvector.fit_transform(self.df["content"])
        print(text_tfidf.shape)
        self.wordbag = tfidfvector.get_feature_names()
        self.df_tfidf = text_tfidf.toarray()
        self.df_tfidf = pd.DataFrame(self.df_tfidf, columns=self.wordbag)

        for word in self.wordbag:
            self.df_tfidf.ix[self.df_tfidf[word] == 0, word] = 0
            self.df_tfidf.ix[self.df_tfidf[word] != 0, word] = 1
            self.df_tfidf[word] = self.df_tfidf[word].astype(int)

    def genitemset(self):
        self.df["itemset"] = self.df.apply(lambda row: genitemset(row, self.wordbag), axis=1)
        self.df["itemset"].to_csv("stackoverflow.basket", header=False, index=False)

    def genarff(self):
        self.df_tfidf.to_csv("stackoverflow.csv", header=False, index=False)

        arfffile = open("stackoverflow.arff", "w")
        arfffile.write("@relation stackoverflow.data\n\n")
        for s in self.wordbag:
            s1 = s.replace(" ", "_")
            arfffile.write("@attribute " + s1 + " {0, 1}\n")
        arfffile.write("\n")
        arfffile.write("@data\n")
        arfffile.close()

In [17]:
ar1 = AssociationRule(tokenize)

In [18]:
ar1.preprocess("django")

(62818, 6)


In [20]:
ar1.tokenizing(ngram=1, min_df=0.1, max_df=0.9, tokenizef=tokenize, voc=None)

(62818, 82)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


In [21]:
ar1.genarff()

In [23]:
ar2 = AssociationRule(tokenize)
ar2.preprocess("pandas")
ar2.tokenizing(ngram=1, min_df=0.1, max_df=0.9, tokenizef=tokenize, voc=None)
ar2.genarff()

(26854, 6)
(26854, 92)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


# Example of extracting those question with tag of django

In [25]:
ar3 = AssociationRule(tokenize)
ar3.preprocess("django")
ar3.tokenizing(ngram=1, min_df=0.1, max_df=0.99, tokenizef=tokenize, voc=None)
ar3.genarff()

(62818, 6)
(62818, 82)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


# Example of using given dictionary to convert text

In [29]:
languages = {"javascript": 0,"java": 1,
             "php": 2,"css": 3,"ruby": 4,
             "c": 5,"swift": 6,"scala": 7,
             "r": 8,"matlab": 9,"python": 10}
ar4 = AssociationRule()
ar4.tokenizing(ngram=1, min_df=0.1, max_df=0.99, tokenizef=tokenize_language, voc=languages)
ar4.genarff()

(607282, 11)


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
