In [1]:
import numpy as np
import pandas as pd
import nltk
import nltk.stem
import nltk.tokenize
import copy
import sklearn.metrics
import matplotlib.pyplot as plt

In [2]:
def get_post_types_based_data(data, post_types):
    post_type_vocab = {}
    for post_type in post_types:
        filtered_titles = data[data['Post Type'] == post_type]
        all_words = ' '.join(filtered_titles['Title'])
        all_words = all_words.lower()
        tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
        vocabulary = tokenizer.tokenize(all_words)
        lemmatizer = nltk.stem.WordNetLemmatizer()
        lemmatized_vocab = [lemmatizer.lemmatize(w) for w in vocabulary]
        post_type_vocab[post_type] = lemmatized_vocab
    return post_type_vocab

In [138]:
def create_file(file_2018, training_data, post_types, vocab,smoothing):
    
    fdist1 = {}
    size_post_type_vocab = {}
    model_dataframe = []
    post_type_vocab = get_post_types_based_data(training_data, post_types)
    for post_type in post_types:
        size_post_type_vocab[post_type] = len(post_type_vocab[post_type])
        fdist1[post_type] = nltk.FreqDist(post_type_vocab[post_type])
    with open(file_2018, "a",encoding="utf-8") as text_file:
        for index,word in enumerate(vocab):
            add_to_file = "%d  %s"%(index+1, word)
            for post_type in post_types:
                word_ptype = fdist1[post_type]
                freq_word_ptype = word_ptype[word]
                cond_probability = (freq_word_ptype + smoothing) / (size_post_type_vocab[post_type] + (len(vocab) * smoothing))
                add_to_file = "%s  %d  %.10f"%(add_to_file, freq_word_ptype, cond_probability)
            model_dataframe.append(add_to_file.split('  '))
            text_file.write(add_to_file + "\n")
            
    return pd.DataFrame(model_dataframe)

In [98]:
def naive_bayes_classifier(data_dataframe, DataFrameFile,test_data ,vocab, post_types,training_data):    
    
    test_data_post_type = {}
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    all_sentences = test_data['Title'].tolist()
    test_data_vocab = ' '.join(test_data['Title'])
    lemmatized_all_sentences = [[lemmatizer.lemmatize(word.lower()) for word in tokenizer.tokenize(s)] for s in all_sentences]
    lemmatize_test_words = [lemmatizer.lemmatize(word.lower()) for word in test_data_vocab.split(' ')]
    test_data_df = pd.DataFrame(lemmatized_all_sentences)
    test_data_df = test_data_df.replace([None], 1)
    
#     print(test_data_df.head())
#     print(data_dataframe.head())
    for post_type in post_types:
        test_data_post_type[post_type] = copy.deepcopy(test_data_df)
    
    
    probabilities_post_types = probabilities_of_post_types(training_data)
    
    for word in lemmatize_test_words:
        for index, post_type in enumerate(post_types):
            row = data_dataframe[data_dataframe[1] == word]
            if(len(row) != 0):
                prob = row[(2 * (index+1) +1)]
                test_data_post_type[post_type] = test_data_post_type[post_type].replace(word, prob)
    
    for post_type in post_types:
        post_data = test_data_post_type[post_type]
        post_data = post_data.apply(pd.to_numeric, errors='coerce')
        test_data_post_type[post_type] = post_data.replace([np.nan], 1)
        
    print("Added the probabilities with post type")
    scores_data = np.zeros([len(test_data['Title']), len(post_types)])
    for index, post_type in enumerate(post_types):
        test_data_post_type[post_type] = probabilities_post_types[post_type] + np.sum(np.log10(test_data_post_type[post_type]), axis=1)
        scores_data[:, index] = test_data_post_type[post_type]
    
    best_scores = np.argmax(scores_data, axis=1).astype('str')
    for index, post_type in enumerate(post_types):
        best_scores[best_scores == str(index)] = post_type
#     print(best_scores)
    #create_file("baseline-result.txt", training_data, post_types, vocab)
    
#     for index, post_type in enumerate(post_types):
#         for i in test_data_post_type[post_type].columns:
#             merge_data = pd.merge(test_data_post_type[post_type], data_dataframe, how='left',left_on=[i], right_on=[data_dataframe.keys()[1]])
#             test_data_post_type[post_type][i] = merge_data[merge_data.keys()[(-2)*(len(post_types) - post_types.index(post_type)) + 1]]
    
#     for post_type in post_types:
#         #print(post_type)
#         print('-----------------------')
#         print(test_data_post_type[post_type])
    data=np.column_stack((np.arange(1,len(test_data['Title'])+1), test_data['Title'], best_scores,scores_data,test_data['Post Type'], np.where(best_scores == test_data['Post Type'], "right", "wrong")))
    np.savetxt(DataFrameFile, data, fmt='%s', encoding='utf-8', delimiter='  ')
    return data

In [5]:
def probabilities_of_post_types(data):
    fdist = {}
    probability = {}
    all_words = ' '.join(data['Post Type'])
    all_words = all_words.lower()
    word_list = nltk.word_tokenize(all_words)
    total_post_types = len(word_list)
    fdist = nltk.FreqDist(word_list)
    for word,freq in fdist.items():
        cond_prob = int(freq)/total_post_types
        probability[word] = cond_prob
    return probability

In [90]:
#if __name__ == "__main__":
    # ******** READING FILE **********
print("Reading hn2018_2019 file")
dataFile = pd.read_csv('hn2018_2019.csv')
    
dataFile.drop("Unnamed: 0", axis=1, inplace=True,)
    #print(dataFile)
print("Separating training and testing data")
    # ******* TRAINING DATA **********
training_data = dataFile[dataFile['Created At'].str.contains("2018")]
    
    # ******* TESTING DATA ***********
testing_data = dataFile[dataFile['Created At'].str.contains("2019")]
    
    # ******* POST TYPES **********
post_types=dataFile['Post Type'].unique()
post_types = post_types.tolist()
    #print(post_types)
    
    #probabilities_of_post_types(training_data)
    
    # ******* GENERATING VOCABULARY **********
print("Generating vocabulary")
all_words = ' '.join(training_data['Title'])
    #print(all_words[0:100])
all_words = all_words.lower()
    
    #print(all_words)
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
vocabulary_eighteen = tokenizer.tokenize(all_words)
    
#     vocabulary_eighteen = nltk.word_tokenize(content)
temp_set = set(vocabulary_eighteen)
unique_vocabulary_eighteen = list(temp_set)
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_vocab = [lemmatizer.lemmatize(w) for w in unique_vocabulary_eighteen]
lemmatized_vocab.sort()
    #print(lemmatized_vocab)
print("Creating model-2018 file")
vocab_dataframe = create_file("model-2018.txt", training_data, post_types, lemmatized_vocab,0.5)
naive_bayes_classifier(vocab_dataframe,"baseline-result.txt",testing_data[0:200], lemmatized_vocab,post_types, training_data)
#     print(vocab_dataframe)
#     vocab_smoothing_dataframe = create_file_GradSmoothingValue("model-2018-tt.txt",training_data, post_types, lemmatized_vocab)

    

Reading hn2018_2019 file
Separating training and testing data
Generating vocabulary
Creating model-2018 file
Added the probabilities with post type




In [85]:
#Stopwords
#def removeSoftWords():
stop_words_file=pd.read_csv("Stopwords.txt")
stop_words_file=pd.DataFrame(
np.row_stack([stop_words_file.columns, stop_words_file.values]),
columns=['Stopwords'])
stopwords_list=stop_words_file['Stopwords'].tolist()
#print(stopwords_list)
#stopwords_list=stop_words_file.split("\n")
filtered_data=[w for w in lemmatized_vocab if w not in stopwords_list]
#print(filtered_data)
stopwordsFile = create_file('stopword-model-2018.txt', training_data, post_types, filtered_data)
naivebayesstopwords = naive_bayes_classifier(stopwordsFile, "stopword-result.txt", testing_data[0:200], lemmatized_vocab,post_types, training_data)

#print(stop_words_file)



Added the probabilities with post type




In [86]:
#WordLengthFiltering

filtered_data=[w for w in lemmatized_vocab if len(w)> 2 and len(w) <9]
#print(filtered_data)
stopwordsFile = create_file('wordlength-model.txt', training_data, post_types, filtered_data)
naivebayesstopwords = naive_bayes_classifier(stopwordsFile, "wordlength-result.txt", testing_data[0:200], lemmatized_vocab,post_types, training_data)


Added the probabilities with post type




In [143]:
frequency_list=[1,5,10,15,20]
freq_dist = nltk.FreqDist(lemmatized_vocab)
words_freq = copy.deepcopy(lemmatized_vocab)
accuracy_list = []
for freq in frequency_list:
    words_after_freq = [word for word in words_freq if freq_dist[word] > freq]
    frequencyModel = create_file("frequency-model"+str(freq)+".txt", training_data, post_types, words_after_freq,0.5)
    frequencyResult = naive_bayes_classifier(frequencyModel, "frequency-result"+str(freq)+".txt", testing_data[0:200], lemmatized_vocab,post_types, training_data)
    
    
    #acc=accuracy_score(smoothingResultArrayTemp,smoothingResultArray)
    #accuracy_list.append(acc)
#plt.plot(frequency_list, accuracy_list)


Added the probabilities with post type




KeyError: 1

In [126]:
freqPercentage = [5, 10, 15, 20, 25]
vocabWords = copy.deepcopy(lemmatized_vocab)
freq_data = nltk.FreqDist(lemmatized_vocab)

accuracy_list = []
for freq in freqPercentage[0:1]:
    freq_words = (freq / 100) * len(lemmatized_vocab)
    print(freq_words)
    most_freq_words = freq_data.most_common(int(freq_words))
    filtered_words = [word[0] for word in most_freq_words]
    words = [word for word in vocabWords if word not in filtered_words]
    freqModel = create_file('freq-model.txt', training_data, post_types, words,0.5)
    freqResult = naive_bayes_classifier(freqModel, "freq-result.txt", testing_data[0:200], lemmatized_vocab,post_types, training_data)
    


3840.7000000000003
Added the probabilities with post type




In [124]:
#Smoothing
smoothing_values=[]
accuracy_score=[]
for i in np.arange(0, 1,0.1):
    rounded_smoothing=float(round(i,2))
    smoothing_values.append(rounded_smoothing)
#print(smoothing_values)
for i in smoothing_values:
    smoothingModel = create_file('smoothing-model.txt', training_data, post_types, filtered_data,i)
    smoothingResult = naive_bayes_classifier(smoothingModel, "smoothing-result.txt", testing_data[0:200], lemmatized_vocab,post_types, training_data)
    #print("--------------")
    #print("------------")
    print(type(smoothingResult), type(smoothingResult[:,7]), type(smoothingResult[:,2]))
    acc=sklearn.metrics.accuracy_score(smoothingResult[:,7],smoothingResult[:,2])
    #print(acc)
    accuracy_score.append(acc)
plt.plot(accuracy_score,smoothing_values)
    

Added the probabilities with post type
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>




Added the probabilities with post type
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Added the probabilities with post type
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Added the probabilities with post type
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Added the probabilities with post type
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Added the probabilities with post type
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Added the probabilities with post type
<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>


KeyboardInterrupt: 