In [137]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under /kaggle/input (walks all subdirectories).
# NOTE(review): the loop binds `_` at notebook top level, which presumably shadows
# IPython's "last output" variable for the rest of the session — confirm if that matters.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

from nltk import tokenize
from nltk.test.classify_fixt import setup_module
from sklearn.model_selection import train_test_split
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.classify import *

In [138]:
# Load the raw tweet CSV. Column names are supplied explicitly via `names`,
# so the first row of the file is read as data rather than as a header.
df = pd.read_csv(
    "twitter_data.csv",
    encoding="ISO-8859-1",
    names=["target", "ids", "date", "flag", "user", "text"],
)

In [139]:
# Loads twitter_data.csv and returns a cleaned DataFrame with a tokenised `final_text` column
def _is_content_token(word, stops):
    """Return True if `word` should be kept as a feature token.

    A token is dropped when it is in `stops`, starts with '@', or contains
    ':' or '/' anywhere in it.
    """
    return (
        word not in stops
        and not word.startswith('@')
        and ":" not in word
        and "/" not in word
    )


def clean_df(path="twitter_data.csv"):
    """Load the tweet CSV and return a cleaned, tokenised DataFrame.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to "twitter_data.csv", the file the
        notebook has always used, so existing `clean_df()` calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns `target`, `ids`, `text`, plus:
        * `split_text` - `text` split on whitespace;
        * `final_text` - `(tokens, target)` tuples, where `tokens` is
          `split_text` with stop words, '@'-prefixed tokens and tokens
          containing ':' or '/' removed.

    Notes
    -----
    * Rows with a missing value in *any* of the six raw columns are dropped
      before `flag`, `user` and `date` are removed.
    * The label 4 is rewritten to 1 in the `target` column only. Previously
      the replacement ran over the whole frame and could also rewrite other
      columns (for example an `ids` value of 4).
    * `target` is always kept. The earlier version deleted it when only one
      class was present and then failed with `KeyError` while building
      `final_text`, which needs the label.
    * NOTE(review): the stop-word test is case-sensitive, so a capitalised
      token such as "The" is kept if the stop-word list is lower-case —
      confirm whether lower-casing is wanted.
    """
    from nltk.corpus import stopwords

    raw = pd.read_csv(
        path,
        encoding="ISO-8859-1",
        names=["target", "ids", "date", "flag", "user", "text"],
    )
    # dropna() runs on all six columns first, matching the original row filter.
    df = raw.dropna().drop(columns=["flag", "user", "date"])

    # Restrict the 4 -> 1 relabelling to the label column.
    df["target"] = df["target"].replace(4, 1)
    df["split_text"] = df["text"].str.split()

    stops = set(stopwords.words('english'))
    df["final_text"] = [
        ([word for word in words if _is_content_token(word, stops)], label)
        for words, label in zip(df["split_text"], df["target"])
    ]
    return df

In [161]:
# Runs the model, returns the accuracy
def run_model(df, classifier, proportion, min_Freq, random_state=None):
    """Train and evaluate a unigram sentiment classifier on a sample of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `final_text` column of `(tokens, label)` tuples.
    classifier : object
        Anything exposing a `train` attribute; `classifier.train` is handed
        to `SentimentAnalyzer.train` as the trainer.
    proportion : float
        Fraction of `df` to sample before splitting (passed to `df.sample`).
    min_Freq : int
        Passed as `min_freq` to `SentimentAnalyzer.unigram_word_feats`.
    random_state : optional
        Forwarded to both `sample` calls. The default `None` keeps the
        original unseeded behaviour; pass an int for a reproducible split.

    Returns
    -------
    The value stored under 'Accuracy' in the analyzer's evaluation result.
    Every metric in that result is also printed as `name: value`, sorted by
    name.
    """
    sampled = df.sample(frac=proportion, random_state=random_state)
    # 80/20 split: the test set is whatever the training sample left behind.
    train_df = sampled.sample(frac=0.8, random_state=random_state)
    test_df = sampled.drop(train_df.index)

    train_docs = train_df["final_text"].tolist()
    test_docs = test_df["final_text"].tolist()

    sentim_analyzer = SentimentAnalyzer()  # Set up model
    # NOTE(review): the vocabulary is built from mark_negation() output while
    # features below are extracted from the unmarked documents. This mirrors
    # the original code — confirm it is intended.
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in train_docs]
    )
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=min_Freq)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(train_docs)
    test_set = sentim_analyzer.apply_features(test_docs)

    sentim_analyzer.train(classifier.train, training_set)

    # Evaluate once and reuse the result; the earlier version evaluated the
    # whole test set a second time just to read the accuracy.
    metrics = sentim_analyzer.evaluate(test_set)
    for key, value in sorted(metrics.items()):
        print('{0}: {1}'.format(key, value))
    return metrics['Accuracy']

In [159]:
def run_all(classifier, proportion, min_Freq):
    """Load and clean the tweet CSV, then train and evaluate in one call.

    This used to be a line-for-line copy of the bodies of `clean_df` and
    `run_model`. It now delegates to them so the cleaning and modelling
    logic lives in one place and cannot drift apart.

    Parameters
    ----------
    classifier : object
        Classifier exposing `train`; forwarded to `run_model`.
    proportion : float
        Fraction of the cleaned data to use; forwarded to `run_model`.
    min_Freq : int
        Unigram frequency threshold; forwarded to `run_model`.

    Returns
    -------
    The accuracy returned by `run_model`. The previous version printed the
    metrics and returned None; callers that ignore the result are unaffected.
    """
    return run_model(clean_df(), classifier, proportion, min_Freq)

In [142]:
# Rebuild `df` via clean_df(), which re-reads the CSV itself; this replaces the raw frame loaded earlier.
df = clean_df()

In [162]:
# Single baseline run: Naive Bayes on a 2% sample of `df`, with min_Freq=2000
# as the unigram frequency threshold handed to run_model.
x = run_model(df, NaiveBayesClassifier, 0.02, 2000)
print(x)  # the accuracy value returned by run_model

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.54
F-measure [0]: 0.3791649093209616
F-measure [1]: 0.6346487962273517
Precision [0]: 0.571882951653944
Precision [1]: 0.5296188898094449
Recall [0]: 0.283596214511041
Recall [1]: 0.791640866873065
Evaluating NaiveBayesClassifier results...
0.54


In [None]:
# Grid search over sample fraction and unigram frequency threshold.
percent_data = [0.05, .1, .15, .2, .25, .3]  # values passed to run_model as `proportion`
min_Freqs = [500, 750, 1000, 1500, 2000]  # values passed to run_model as `min_Freq`
results = []  # filled with ((proportion, min_Freq), accuracy) pairs

# 6 x 5 = 30 runs; every run_model call draws its own random sample of `df`,
# so runs are not directly comparable on identical data.
for p in percent_data:
    for m in min_Freqs:
        results.append(((p, m), run_model(df, NaiveBayesClassifier, p, m)))

print(results)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.627375
F-measure [0]: 0.5956869659568697
F-measure [1]: 0.6544569375217341
Precision [0]: 0.6512455516014235
Precision [1]: 0.6099827139152982
Recall [0]: 0.548862784303924
Recall [1]: 0.705926481620405
Evaluating NaiveBayesClassifier results...
Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.5983125
F-measure [0]: 0.5548244095033594
F-measure [1]: 0.6340602402778568
Precision [0]: 0.6170081651517486
Precision [1]: 0.5855505310758229
Recall [0]: 0.5040271834885477
Recall [1]: 0.6913334988825428
Evaluating NaiveBayesClassifier results...
Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.597625
F-measure [0]: 0.5523571130579892
F-measure [1]: 0.6345782722215916
Precision [0]: 0.6288790373654212
Precision [1]: 0.577240809582817
Recall [0]: 0.49243739151996035
Recall [1]: 0.7045626417948072
Evaluating NaiveBayesClassifier results...
Training classifier
Evaluating N