In [3]:
import csv
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from textblob import TextBlob

In [4]:
# Fetch the NLTK data packages this notebook relies on: the 'stopwords'
# corpus (read via stopwords.words('english')) and 'wordnet' (presumably
# required by WordNetLemmatizer -- confirm against the NLTK docs).
# The download is skipped when the package is already up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vemol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vemol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
def get_percentage_of_total(item, total):
    """Return ``item`` expressed as a percentage of ``total``.

    Args:
        item: The part, e.g. the count of one category.
        total: The whole that ``item`` is measured against.

    Returns:
        ``item / total * 100``. When ``total`` is zero the result is ``0.0``
        so that an empty data set reports 0 % instead of raising
        ``ZeroDivisionError``.
    """
    if total == 0:
        return 0.0
    return item / total * 100

def get_tweets_text_from_file(file_name):
    """Read a UTF-16 text file and return its lines without line terminators.

    The file is read line by line as plain text; no CSV parsing (quoting,
    field splitting) is applied. Presumably each line holds one tweet --
    verify against how the input files were produced.

    Args:
        file_name: Path of the UTF-16 encoded file to read.

    Returns:
        A list of strings, one per line, with trailing ``\\r`` / ``\\n``
        characters removed. Empty lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeError: If the content is not valid UTF-16.
    """
    # newline="" disables newline translation, so the file's own terminators
    # reach us unchanged and are stripped explicitly below.
    with open(file_name, "r", newline="", encoding="utf-16") as csvfile:
        return [line.rstrip("\r\n") for line in csvfile]

def clean_text_array(text_array):
    """Remove URLs and ASCII punctuation from every string in ``text_array``.

    Args:
        text_array: Iterable of strings to clean.

    Returns:
        A new list with one cleaned string per input string, in the same
        order. The input is not modified.
    """
    # Raw string literal: the pattern contains regex escapes (\w, \., \s)
    # that are not valid *string* escapes and would otherwise trigger
    # invalid-escape warnings. Pattern originally taken from StackOverflow;
    # it matches http/https/ftp URLs plus one optional trailing whitespace.
    url_pattern = re.compile(
        r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?\s?'
    )
    # Deletion table for every character in string.punctuation. This also
    # covers '@' and '#', so handle and hashtag markers are removed as well.
    punctuation_table = str.maketrans("", "", string.punctuation)

    return [
        url_pattern.sub("", text).translate(punctuation_table)
        for text in text_array
    ]

def tokenize_text_array(text_array):
    """Tokenize, filter and lemmatize every string in ``text_array``.

    Each text is split with NLTK's ``TweetTokenizer`` (handles stripped,
    repeated characters shortened). A token is kept only if it is longer
    than two characters, purely alphabetic, and not an English stop word.
    Kept tokens are lower-cased and lemmatized with ``WordNetLemmatizer``.

    Args:
        text_array: Iterable of strings to tokenize.

    Returns:
        A list with one list of processed tokens per input string, in the
        same order.
    """
    # A set gives constant-time membership tests on the stop-word list.
    eng_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Tokenizer options are identical for every text, so build it once.
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokenized_text_array = []

    for text in text_array:
        tokens = []
        for word in tokenizer.tokenize(text):
            if len(word) <= 2 or not word.isalpha():
                continue
            # Compare the lower-cased form: NLTK's English stop words are
            # lower-case, so checking the raw token would let capitalised
            # stop words such as "The" slip through.
            lowered = word.lower()
            if lowered in eng_stopwords:
                continue
            tokens.append(lemmatizer.lemmatize(lowered))
        tokenized_text_array.append(tokens)

    return tokenized_text_array

def sentiment_analysis(token_array):
    """Classify each token list by TextBlob polarity and summarise the counts.

    Every token list is joined with single spaces and scored with
    ``TextBlob(...).sentiment.polarity``. A score above zero counts as
    positive, exactly zero as neutral, anything else as negative.

    Args:
        token_array: Iterable of token lists (one list of strings per tweet).

    Returns:
        A list ``[positive, negative, neutral, positive_percentage,
        negative_percentage, neutral_percentage, total_analyzed]``.
    """
    tally = {"positive": 0, "neutral": 0, "negative": 0}

    for tokens in token_array:
        polarity = TextBlob(" ".join(tokens)).sentiment.polarity
        if polarity > 0:
            label = "positive"
        elif polarity == 0:
            label = "neutral"
        else:
            label = "negative"
        tally[label] += 1

    total_analyzed = tally["positive"] + tally["neutral"] + tally["negative"]

    # Result order is counts first (positive, negative, neutral), then the
    # matching percentages in the same order, then the grand total.
    ordered_labels = ("positive", "negative", "neutral")
    counts = [tally[label] for label in ordered_labels]
    percentages = [
        get_percentage_of_total(count, total_analyzed) for count in counts
    ]
    return counts + percentages + [total_analyzed]

def write_sentiment_results_to_csv(output_file_name, results_array, file_names):
    """Write one CSV row of sentiment statistics per analysed file.

    The output is UTF-16 encoded and starts with a header row. Percentages
    are rounded to two decimal places.

    Args:
        output_file_name: Path of the CSV file to create or overwrite.
        results_array: Sequence of result lists laid out as
            ``[positive, negative, neutral, positive %, negative %,
            neutral %, total]``.
        file_names: Sequence of labels; ``file_names[i]`` names the row
            written for ``results_array[i]``.
    """
    field_names = [
        'result for',
        'positive count',
        'positive percentage',
        'negative count',
        'negative percentage',
        'neutral count',
        'neutral percentage',
        'total',
    ]

    with open(output_file_name, "w", newline='', encoding="utf-16") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()

        for index, result in enumerate(results_array):
            # Values are listed in the same order as field_names above.
            values = [
                file_names[index],
                result[0],
                round(result[3], 2),
                result[1],
                round(result[4], 2),
                result[2],
                round(result[5], 2),
                result[6],
            ]
            writer.writerow(dict(zip(field_names, values)))

def analyze(file_name):
    """Run the read -> clean -> tokenize -> sentiment pipeline on one file.

    Args:
        file_name: Path of the input file, passed to
            ``get_tweets_text_from_file``.

    Returns:
        The result list produced by ``sentiment_analysis`` for this file.
    """
    raw_lines = get_tweets_text_from_file(file_name)
    token_lists = tokenize_text_array(clean_text_array(raw_lines))
    summary = sentiment_analysis(token_lists)

    # Progress message so long multi-file runs show which file just finished.
    print("{} analyzed".format(file_name))
    return summary

def do_it_all(file_names, output_file_name="results.csv"):
    """Analyse every file in ``file_names`` and write a summary CSV.

    Args:
        file_names: Iterable of input file paths. Each path is analysed with
            ``analyze`` and also used as the row label in the output.
        output_file_name: Path of the CSV file to write. Defaults to
            ``"results.csv"``, the location that was previously hard-coded.
    """
    # Materialise once: the names are needed twice (analysis and row labels),
    # which a one-shot iterator could not provide.
    file_names = list(file_names)

    sentiment_results = [analyze(file_name) for file_name in file_names]

    write_sentiment_results_to_csv(output_file_name, sentiment_results, file_names)
    print("done")

In [10]:
# Input files to analyse, one per day from 2017-03-28 through 2017-04-13.
file_names = [
    "2017-03-28-pepsi.csv",
    "2017-03-29-pepsi.csv",
    "2017-03-30-pepsi.csv",
    "2017-03-31-pepsi.csv",
    "2017-04-01-pepsi.csv",
    "2017-04-02-pepsi.csv",
    "2017-04-03-pepsi.csv",
    "2017-04-04-pepsi.csv",
    "2017-04-05-pepsi.csv",
    "2017-04-06-pepsi.csv",
    "2017-04-07-pepsi.csv",
    "2017-04-08-pepsi.csv",
    "2017-04-09-pepsi.csv",
    "2017-04-10-pepsi.csv",
    "2017-04-11-pepsi.csv",
    "2017-04-12-pepsi.csv",
    "2017-04-13-pepsi.csv",
]

do_it_all(file_names)

2017-03-28-pepsi.csv analyzed
2017-03-29-pepsi.csv analyzed
2017-03-30-pepsi.csv analyzed
2017-03-31-pepsi.csv analyzed
2017-04-01-pepsi.csv analyzed
2017-04-02-pepsi.csv analyzed
2017-04-03-pepsi.csv analyzed
2017-04-04-pepsi.csv analyzed
2017-04-05-pepsi.csv analyzed
2017-04-06-pepsi.csv analyzed
2017-04-07-pepsi.csv analyzed
2017-04-08-pepsi.csv analyzed
2017-04-09-pepsi.csv analyzed
2017-04-10-pepsi.csv analyzed
2017-04-11-pepsi.csv analyzed
2017-04-12-pepsi.csv analyzed
2017-04-13-pepsi.csv analyzed
done
