In [21]:
import pandas as pd
import numpy as np

import shutil

import sys

import os
from os import listdir
from os.path import isfile, join

from datetime import datetime

import re
import string

from textblob import TextBlob

import nltk
from nltk import pos_tag, pos_tag_sents
from nltk.tokenize import TweetTokenizer 
tokenizer = TweetTokenizer()

from nltk.corpus import stopwords

# Project-specific stop words, appended to NLTK's English stop word list below.
# Every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated by Python ('around' 'us' becomes 'aroundus').
additional_stops = [
    'apple', 'm1', 'mac', 'new', 'rt', 'get', 'go', 'one', 'even', 'would',
    'macs', 'make', 'want', 'yes', 'really', 'could', 'say', 'lot', 'via', 'something', 'right',
    'since', 'give', 'hackintosh', 'ago', 'hi', 'ask', 'bo', 'probably', 'put', 'end', 'might',
    'around', 'us', 'happen', 'kill', 'use', 'mini', 'macbook',
]

# Start from NLTK's English stop word list.
stopwords_list = stopwords.words('english')
# list() over the concatenated string adds every punctuation mark and digit
# as its own one-character entry.
stopwords_list += list(string.punctuation + string.digits)
# Finally append the project-specific terms from additional_stops.
stopwords_list += additional_stops

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

import GeneralFunctions


def retrieve_batch(source_dir, ignore):
    '''Read every CSV file in a directory and stack them into one data frame.

    Parameters:
        source_dir (str): directory to scan. Only regular files directly inside
            it are read; sub-directories are skipped.
        ignore (list): file names to skip, e.g. ['.DS_Store'].

    Returns:
        pd.DataFrame: the rows of all files that were read, concatenated in
        directory-listing order with each file's own index preserved. An empty
        DataFrame is returned when there is nothing to read.
    '''
    file_names = [
        f for f in listdir(source_dir)
        if isfile(join(source_dir, f)) and f not in ignore
    ]

    # The batch name only labels the progress messages printed below.
    batch_name = datetime.now().strftime('%Y-%m-%d_%H_%M_%S') + '.csv'

    frames = []
    for file_name in file_names:
        data = pd.read_csv(join(source_dir, file_name))
        print('[*] Adding ', file_name, 'to batch: ', batch_name)
        frames.append(data)

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing the frame file by file, which
    # re-copies all previously loaded rows on every iteration.
    return pd.concat(frames, axis=0)


def clean_text(text, tokenizer):
    '''Tokenize text and return its lowercase, purely alphabetic words as a string.

    Parameters:
        text (str): raw text to clean.
        tokenizer (object): anything exposing tokenize(str) that returns a list
            of string tokens (a TweetTokenizer in this file).

    Returns:
        str: the surviving tokens, lowercased and joined with single spaces.

    Any token containing a non-alphabetic character is dropped. Of the
    remaining words, those starting with 'http' or 'www' are dropped as well.
    '''
    # '@' and '#' are listed for completeness; tokens starting with them never
    # reach this check because they already fail the isalpha() filter.
    unwanted_prefixes = ('@', 'http', 'www', '#')

    words = [t.lower() for t in tokenizer.tokenize(text) if t.isalpha()]
    kept = [w for w in words if not w.startswith(unwanted_prefixes)]

    return ' '.join(kept)


def process_text(text, stopwords_list, lemmatizer):
    '''Split text on single spaces, drop stop words and lemmatize the rest as verbs.

    Parameters:
        text (str): space-separated tokens.
        stopwords_list (list): tokens to discard.
        lemmatizer (object): anything exposing lemmatize(token, pos=...).

    Returns:
        list: the lemmatized form of every token not found in stopwords_list.
    '''
    lemmas = []
    for word in text.split(' '):
        if word in stopwords_list:
            continue
        lemmas.append(lemmatizer.lemmatize(word, pos='v'))

    return lemmas


def label_subjectivity(df):
    '''Add a 'subjectivity' column computed with TextBlob from 'cleaned_text'.

    Parameters:
        df (pd.DataFrame): must contain a 'cleaned_text' column of strings.

    Returns:
        pd.DataFrame: a frame carrying the new 'subjectivity' column, taken from
        position 1 of TextBlob's sentiment result, without the intermediate
        'scores' column. Note that the frame passed in is modified in place and
        keeps both 'scores' and 'subjectivity'.
    '''
    sentiments = df['cleaned_text'].apply(lambda doc: TextBlob(doc).sentiment)
    df['scores'] = sentiments
    df['subjectivity'] = df['scores'].apply(lambda sentiment: sentiment[1])

    return df.drop(columns='scores')


def label_polarity(df, analyzer):
    '''Add a compound sentiment score and a three-way polarity label.

    Parameters:
        df (pd.DataFrame): must contain a 'cleaned_text' column.
        analyzer (object): anything exposing polarity_scores(str) that returns
            a mapping with a 'compound' entry (a VADER
            SentimentIntensityAnalyzer in this file).

    Returns:
        pd.DataFrame: a new frame with the columns of df plus
            'com'      - the compound score of each row, and
            'polarity' - 1 when com > 0, -1 when com < 0, otherwise com itself
                         (i.e. 0 for a neutral score).
        The frame passed in is not modified.
    '''
    def compound_score(text):
        # Non-string values (e.g. a float NaN) cannot be scored; treat them as
        # neutral instead of failing further down the pipeline.
        if not isinstance(text, str):
            return 0.0
        return analyzer.polarity_scores(text)['compound']

    def to_label(score):
        if score > 0:
            return 1
        if score < 0:
            return -1
        return score

    compound = df['cleaned_text'].apply(compound_score)

    # assign() returns a copy, so the caller's frame is left untouched.
    return df.assign(com=compound, polarity=compound.apply(to_label))


def count_tags(text, tag):
    '''Return the tag of every (token, tag) pair in text whose tag equals tag.

    The result is a list with one entry per match, so its length is the number
    of times the tag occurs.
    '''
    matches = []
    for pair in text:
        label = pair[1]
        if label == tag:
            matches.append(label)
    return matches


def pos_tag_data(df):
    '''Tag every row of 'cleaned_text' and add one count column per POS tag.

    Parameters:
        df (pd.DataFrame): must contain a 'cleaned_text' column; each entry is
            expected to be a list of tokens, as produced by process_text.

    Returns:
        pd.DataFrame: the same frame, modified in place, with
            'pos_tags' - the (token, tag) pairs returned by pos_tag_sents, and
            one column per distinct tag holding how often that tag occurs in
            the row. Tag columns appear in order of first occurrence.
    '''
    df['pos_tags'] = pos_tag_sents(df['cleaned_text'].tolist())

    # dict.fromkeys de-duplicates while keeping first-seen order, so each tag
    # is counted exactly once even when it repeats within a single row.
    all_tags = list(dict.fromkeys(
        pair[1] for tagged in df['pos_tags'] for pair in tagged
    ))

    for tag in all_tags:
        df[tag] = df['pos_tags'].apply(lambda tagged: len(count_tags(tagged, tag)))

    return df


def process_data(platform, df, tokenizer, stopwords_list, lemmatizer, analyzer):
    '''Run the preprocessing pipeline on the 'text' column of df.

    Steps, in order:
        1. keep only the 'text' column and coerce every entry to str;
        2. clean_text -> 'cleaned_text', and drop rows whose cleaned text is empty;
        3. label_subjectivity and label_polarity on the cleaned strings;
        4. process_text replaces 'cleaned_text' with a list of lemmatized tokens;
        5. 'text_len' = number of space-separated pieces in the raw text;
        6. pos_tag_data adds 'pos_tags' and the per-tag count columns.

    Parameters:
        platform: not used by this function; kept so existing calls still work.
        df (pd.DataFrame): must contain a 'text' column.
        tokenizer: passed through to clean_text.
        stopwords_list (list): passed through to process_text.
        lemmatizer: passed through to process_text.
        analyzer: passed through to label_polarity.

    Returns:
        pd.DataFrame: the processed frame; the frame passed in is not modified.
    '''
    print('\n[*] Intiating text cleaning...')
    print('-- Tokenizing...')
    print('-- Removing non-alphabetic characters...')
    print('-- Converting to lowercase...')
    print('-- Removing hashtags, web addresses, and mentions...')
    print('-- Removing any code tags...')
    print('-- Joining tokens for further processing...')

    df = pd.DataFrame(df.text)

    df['text'] = df.text.apply(str)
    df['cleaned_text'] = df['text'].apply(lambda x: clean_text(x, tokenizer))

    # 'lens' is the character length of the cleaned string, not a token count.
    df['lens'] = df.cleaned_text.apply(len)
    # Copy the filtered rows so the column assignments below act on an
    # independent frame rather than on a slice (avoids SettingWithCopyWarning).
    df = df[df.lens > 0].copy()

    print('\n[*] Labeling subjectivity...')
    print('-- Calculating subjectivity scores...')
    df = label_subjectivity(df)

    print('\n[*] Labeling polarity...')
    print('-- Calculating polarity scores...')
    print('-- Determining polarity label...')
    df = label_polarity(df, analyzer)

    print('\n[*] Processing text...')
    print('-- Removing stopwords...')
    print('-- Lemmatizing tokens...')

    # From here on 'cleaned_text' holds token lists instead of strings.
    df['cleaned_text'] = df.cleaned_text.apply(lambda x: process_text(x, stopwords_list, lemmatizer))

    print('\n[*] Calculating text length...')
    df['text_len'] = df.text.apply(lambda x: len(x.split(' ')))

    print('\n[*] Applying POS tags...')
    print('-- Obtaining POS tags...')
    print('-- Creating POS tag list...')
    print('-- Counting POS tags...')
    df = pos_tag_data(df)

    print('\n[*] Preprocessing Complete')

    return df


def _timestamped_csv_name():
    '''Return the current local time formatted as 'YYYY-MM-DD_HH_MM_SS.csv'.'''
    return datetime.now().strftime('%Y-%m-%d_%H_%M_%S') + '.csv'


def batch_and_process_data(platform):
    '''Load the raw session files for a platform, preprocess them and save the result.

    Parameters:
        platform (str): 'Twitter' or 'Reddit' to process a single source. Any
            other value (e.g. 'both') loads both sources and combines them.

    Returns:
        pd.DataFrame: the frame returned by process_data. The same frame is
        also written, without its index, to a timestamped CSV file in the
        target directory of the selected platform.
    '''
    GeneralFunctions.create_banner('Preprocess Data')
    print('\n[*] Obtaining data...\n')

    # NOTE(review): source and target directories are hard-coded absolute paths.
    ignore = ['.DS_Store']
    twitter_source_dir = '/Users/christineegan/AppleM1SentimentAnalysis/data/tweet_data/raw_data/'
    twitter_target_dir = '/Users/christineegan/AppleM1SentimentAnalysis/data/tweet_data/labeled_data/'
    reddit_source_dir = '/Users/christineegan/AppleM1SentimentAnalysis/data/reddit_data/raw_data/session_data/'
    reddit_target_dir = '/Users/christineegan/AppleM1SentimentAnalysis/data/reddit_data/labeled_data/'
    combined_target_dir = '/Users/christineegan/AppleM1SentimentAnalysis/data/combined_data/'

    print('\n[*] Retrieving session data from source directory...\n')

    # retrieve_batch returns a single DataFrame, so every branch takes its
    # result as one value. Twitter rows are de-duplicated on 'user', Reddit
    # rows on 'text'.
    if platform == 'Twitter':
        raw_data = retrieve_batch(twitter_source_dir, ignore)
        raw_data = raw_data.drop_duplicates(subset='user')
        target_dir = twitter_target_dir

    elif platform == 'Reddit':
        raw_data = retrieve_batch(reddit_source_dir, ignore)
        raw_data = raw_data.drop_duplicates(subset='text')
        target_dir = reddit_target_dir

    else:
        twitter_raw_data = retrieve_batch(twitter_source_dir, ignore)
        twitter_raw_data = twitter_raw_data.drop_duplicates(subset='user')

        reddit_raw_data = retrieve_batch(reddit_source_dir, ignore)
        reddit_raw_data = reddit_raw_data.drop_duplicates(subset='text')

        raw_data = pd.concat([twitter_raw_data, reddit_raw_data], axis=0)
        target_dir = combined_target_dir

    batch_name = _timestamped_csv_name()

    print('\n[*] Preprocessing batch:', batch_name)
    data = process_data(platform, raw_data, tokenizer, stopwords_list, lemmatizer, analyzer)

    # The output file is stamped after processing finishes, so its name can be
    # later than the batch name printed above.
    csv_name = join(target_dir, _timestamped_csv_name())

    print('\n[*] Saving processed results to ', csv_name)
    data.to_csv(csv_name, index=False)

    return data

In [22]:
data = batch_and_process_data('both')


Preprocess Data
------------------------------

[*] Obtaining data...


[*] Retrieving session data from source directory...

[*] Adding  2021-01-28_12_47_40.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-02-01_20_11_54.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_08_25.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_08_19.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_32_19.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_21_56_58.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_10_45.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_08_18.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_21_30_40.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-25_14_30_40.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_08_24.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_11_53_00.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-3

[*] Adding  2021-02-01_19_58_06.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_21_17_32.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_11_58_03.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_17_01_05.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_21_48_52.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-25_14_35_52.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_20_56_19.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_21_39_46.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_21_23_36.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_08_12.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_21_54_57.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_44_36.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-28_12_08_16.csv to batch:  2021-02-02_01_14_30.csv
[*] Adding  2021-01-30_20_54_18.csv to batch:  2021-02-02_01_14_30.csv
[*] Ad

In [23]:
df = pd.read_csv('/Users/christineegan/AppleM1SentimentAnalysis/data/combined_data/2021-02-02_01_14_46.csv')

In [24]:
df

Unnamed: 0,text,cleaned_text,lens,subjectivity,com,polarity,text_len,pos_tags,NN,JJ,...,RBS,NNP,RP,WP,PRP,WDT,WRB,WP$,TO,''
0,With the Apple M1 chip on MacBook Air and MacB...,"['chip', 'air', 'pro', 'students', 'cpu', 'inc...",98,0.900000,0.0000,0.0,21,"[('chip', 'NN'), ('air', 'NN'), ('pro', 'JJ'),...",4,2,...,0,0,0,0,0,0,0,0,0,0
1,RT @saraali16413422: #ParkJimin #KimNamjoon #S...,"['best', 'laptop', 'chip', 'giant', 'leap', 'p...",53,0.650000,0.6369,1.0,20,"[('best', 'JJS'), ('laptop', 'JJ'), ('chip', '...",3,1,...,0,0,0,0,0,0,0,0,0,0
2,RT @saraali16413422: #ParkJimin #KimNamjoon #S...,"['best', 'laptop', 'chip', 'giant', 'leap', 'p...",53,0.650000,0.6369,1.0,20,"[('best', 'JJS'), ('laptop', 'JJ'), ('chip', '...",3,1,...,0,0,0,0,0,0,0,0,0,0
3,RT @Bisma22148197: #ParkJimin #KimNamjoon #SUP...,"['best', 'laptop', 'chip', 'giant', 'leap', 'p...",56,0.650000,0.6369,1.0,20,"[('best', 'JJS'), ('laptop', 'JJ'), ('chip', '...",2,3,...,0,0,0,0,0,0,0,0,0,0
4,Hey! New #podcast Basics of #computer architec...,"['hey', 'basics', 'architecture', 'feature', '...",76,0.527273,0.4404,1.0,19,"[('hey', 'NN'), ('basics', 'NNS'), ('architect...",2,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2101,While I appreciate Apple’s fight for user priv...,"['appreciate', 'fight', 'user', 'privacy', 'wi...",261,0.335658,0.5719,1.0,46,"[('appreciate', 'NN'), ('fight', 'NN'), ('user...",12,6,...,0,0,0,0,0,0,0,0,0,0
2102,Apple should make this statement at a serious ...,"['statement', 'serious', 'security', 'conventi...",149,0.833333,0.7184,1.0,28,"[('statement', 'NN'), ('serious', 'JJ'), ('sec...",8,1,...,0,0,0,0,0,0,0,0,0,0
2103,Committed to privacy but yet bans a free speec...,"['commit', 'privacy', 'yet', 'ban', 'free', 's...",114,0.522222,0.8658,1.0,22,"[('commit', 'NN'), ('privacy', 'NN'), ('yet', ...",7,3,...,0,0,0,0,0,0,0,0,0,0
2104,7zip file how do I open a 7 zip file without k...,"['file', 'open', 'zip', 'file', 'without', 'kn...",125,0.500000,0.0000,0.0,28,"[('file', 'NN'), ('open', 'JJ'), ('zip', 'NN')...",9,1,...,0,0,0,0,0,0,0,0,0,0
