In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("judge-1377884607_tweet_product_company.csv", encoding='latin1')

## Data Exploration and Cleaning

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB


In [3]:
#locate and drop null tweets 
df.loc[df['tweet_text'].isna()]
df.dropna(subset='tweet_text', inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9092 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9092 non-null   object
dtypes: object(3)
memory usage: 284.1+ KB


In [5]:
df.head(20)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion
10,Excited to meet the @samsungmobileus at #sxsw ...,Android,Positive emotion


In [6]:
#filter out no product
df_products = df.dropna(subset='emotion_in_tweet_is_directed_at')

In [7]:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3291 entries, 0 to 9088
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          3291 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  3291 non-null   object
dtypes: object(3)
memory usage: 102.8+ KB


In [8]:
df_products['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [9]:
#add a column to note whether Apple or Google 
def company(product):
    apple_prods = ['ipad', 'apple', 'iphone']
    if any(item in product.lower() for item in apple_prods):
        return 'Apple'
    else:
        return 'Google'


In [10]:
df_products['Company'] = df_products['emotion_in_tweet_is_directed_at'].apply(lambda x: company(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_products['Company'] = df_products['emotion_in_tweet_is_directed_at'].apply(lambda x: company(x))


In [11]:
df_products.head(20)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,Company
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,Apple
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,Apple
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,Apple
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,Apple
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,Google
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion,Google
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion,Apple
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion,Apple
10,Excited to meet the @samsungmobileus at #sxsw ...,Android,Positive emotion,Google
11,Find &amp; Start Impromptu Parties at #SXSW Wi...,Android App,Positive emotion,Google


In [12]:
#simplify column names
df_products.columns = ['text', 'subject', 'emotion', 'company']
#clean 

In [13]:
df_products['emotion'].value_counts()

Positive emotion                      2672
Negative emotion                       519
No emotion toward brand or product      91
I can't tell                             9
Name: emotion, dtype: int64

In [14]:
pd.set_option('display.max_colwidth', None)

In [15]:
#drop columns with no emotion or ambiguous emotion
df_products_cleaned = df_products[(df_products['emotion'] == 'Positive emotion') | (df_products['emotion'] == "Negative emotion")]

In [16]:
df_products_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3191 entries, 0 to 9088
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     3191 non-null   object
 1   subject  3191 non-null   object
 2   emotion  3191 non-null   object
 3   company  3191 non-null   object
dtypes: object(4)
memory usage: 124.6+ KB


In [17]:
df_products_cleaned['emotion'] = df_products_cleaned['emotion'].apply(lambda x: x.split(' ')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_products_cleaned['emotion'] = df_products_cleaned['emotion'].apply(lambda x: x.split(' ')[0])


In [18]:
#look at split of our data
df_products_cleaned['emotion'].value_counts(normalize=True)
#since our data is imbalanced, we will likely either perform SMOTE or just make sure to stratify on emotion when doing our train test split 

Positive    0.837355
Negative    0.162645
Name: emotion, dtype: float64

In [19]:
#let's examine the RT category. Do we include them? how prevelant are they in our data set? 
df_products_cleaned[df_products_cleaned['text'].str.contains("RT")]
#there are 863 rows with RTs, but I think we can just count them as individual tweets


Unnamed: 0,text,subject,emotion,company
8,Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.ly/ieaVOB,iPad or iPhone App,Positive,Apple
13,Gotta love this #SXSW Google Calendar featuring top parties/ show cases to check out. RT @hamsandwich via @ischafer =&gt;http://bit.ly/aXZwxB,Other Google product or service,Positive,Google
19,Must have #SXSW app! RT @malbonster: Lovely review from Forbes for our SXSW iPad app Holler Gram - http://t.co/g4GZypV,iPad or iPhone App,Positive,Apple
25,"RT @LaurieShook: I'm looking forward to the #SMCDallas pre #SXSW party Wed., and hoping I'll win an #iPad resulting from my shameless promotion. #ChevySMC",iPad,Positive,Apple
26,"RT haha, awesomely rad iPad app by @madebymany http://bit.ly/hTdFim #hollergram #sxsw (via @michaelpiliero)",iPad or iPhone App,Positive,Apple
...,...,...,...,...
8889,Ummmm...awesome? Party? RT djroe Apple store downtown Austin open til Midnight. #sxsw,Apple,Positive,Apple
8905,We see a WHITE IPAD 2! RT @mention Now you know what @mention apartment at #sxsw looks like... NEEDS MORE WIRES {link},iPad,Positive,Apple
8992,"Looks very interesting RT@mention Google to Launch Major New Social Network Called Circles, Possibly Today {link} #sxsw",Other Google product or service,Positive,Google
9025,Absolutely! RT @mention Timely good schtuff from Google (People Finder 4 japan quake) {link} #iQlab #sxsw,Other Google product or service,Positive,Google


## Data Pre-Processing

In [27]:
from nltk.corpus import stopwords
from nltk import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re


In [100]:

#define our preprocessing function
def text_preprocessing(tweet):
    tweet_cleaned = re.sub(r'@\w+', '', tweet)
    tweet_cleaned = re.sub(r'http://\S+', '', tweet_cleaned)
    pattern = "([a-zA-Z\d]+(?:'[a-z\d]+)?)"
    tokenizer = RegexpTokenizer(pattern)
    tokens = tokenizer.tokenize(tweet_cleaned)
    stop_words_list = stopwords.words('english')
    stop_words_list += ['sxsw', 'link', 'rt', 'quot']
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words_list]
    lemmatizer = WordNetLemmatizer()
    lemmed_tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return lemmed_tokens




In [62]:
df_products_cleaned['text'][19]

'Must have #SXSW app! RT @malbonster: Lovely review from Forbes for our SXSW iPad app Holler Gram - http://t.co/g4GZypV'

In [101]:
#test on one tweet
tester = df_products_cleaned['text'][19]
test_tokens = text_preprocessing(tester)

test_tokens

['must', 'app', 'lovely', 'review', 'forbes', 'ipad', 'app', 'holler', 'gram']

### Apply the pre-processing function to our dataframe

In [103]:
df_products_cleaned['tokens'] = df_products_cleaned['text'].apply(text_preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_products_cleaned['tokens'] = df_products_cleaned['text'].apply(text_preprocessing)


In [104]:
df_products_cleaned.head(20)

Unnamed: 0,text,subject,emotion,company,tokens
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative,Apple,"[3g, iphone, 3, hr, tweeting, rise, austin, dead, need, upgrade, plugin, station]"
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive,Apple,"[know, awesome, ipad, iphone, app, likely, appreciate, design, also, they're, giving, free, t]"
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive,Apple,"[wait, ipad, 2, also, sale]"
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative,Apple,"[hope, year's, festival, crashy, year's, iphone, app]"
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive,Google,"[great, stuff, fri, marissa, mayer, google, tim, reilly, tech, book, conference, amp, matt, mullenweg, wordpress]"
7,"#SXSW is just starting, #CTIA is around the corner and #googleio is only a hop skip and a jump from there, good time to be an #android fan",Android,Positive,Google,"[starting, ctia, around, corner, googleio, hop, skip, jump, good, time, android, fan]"
8,Beautifully smart and simple idea RT @madebymany @thenextweb wrote about our #hollergram iPad app for #sxsw! http://bit.ly/ieaVOB,iPad or iPhone App,Positive,Apple,"[beautifully, smart, simple, idea, wrote, hollergram, ipad, app]"
9,Counting down the days to #sxsw plus strong Canadian dollar means stock up on Apple gear,Apple,Positive,Apple,"[counting, day, plus, strong, canadian, dollar, mean, stock, apple, gear]"
10,Excited to meet the @samsungmobileus at #sxsw so I can show them my Sprint Galaxy S still running Android 2.1. #fail,Android,Positive,Google,"[excited, meet, show, sprint, galaxy, still, running, android, 2, 1, fail]"
11,Find &amp; Start Impromptu Parties at #SXSW With @HurricaneParty http://bit.ly/gVLrIn I can't wait til the Android app comes out.,Android App,Positive,Google,"[find, amp, start, impromptu, party, can't, wait, til, android, app, come]"


In [105]:
#create a set of unique tokens 
token_set = []
for tokens in df_products_cleaned['tokens'].values:
    for token in tokens: 
        token_set.append(token)

len(token_set)


32060

In [107]:
#we have 5,136 unique lemmed tokens 
len(set(token_set))

5136

In [91]:
df_products_cleaned.groupby(['company', 'subject', 'emotion']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text,tokens
company,subject,emotion,Unnamed: 3_level_1,Unnamed: 4_level_1
Apple,Apple,Negative,95,95
Apple,Apple,Positive,543,543
Apple,Other Apple product or service,Negative,2,2
Apple,Other Apple product or service,Positive,32,32
Apple,iPad,Negative,125,125
Apple,iPad,Positive,793,793
Apple,iPad or iPhone App,Negative,63,63
Apple,iPad or iPhone App,Positive,397,397
Apple,iPhone,Negative,103,103
Apple,iPhone,Positive,184,184


In [108]:
#convert to lemmed token format
def format_lem(tokens):
    return ' '.join(tokens)

df_products_cleaned['formatted_lemmed_text'] = df_products_cleaned['tokens'].apply(format_lem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_products_cleaned['formatted_lemmed_text'] = df_products_cleaned['tokens'].apply(format_lem)


In [109]:
df_products_cleaned.head()

Unnamed: 0,text,subject,emotion,company,tokens,formatted_lemmed_text
0,".@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead! I need to upgrade. Plugin stations at #SXSW.",iPhone,Negative,Apple,"[3g, iphone, 3, hr, tweeting, rise, austin, dead, need, upgrade, plugin, station]",3g iphone 3 hr tweeting rise austin dead need upgrade plugin station
1,"@jessedee Know about @fludapp ? Awesome iPad/iPhone app that you'll likely appreciate for its design. Also, they're giving free Ts at #SXSW",iPad or iPhone App,Positive,Apple,"[know, awesome, ipad, iphone, app, likely, appreciate, design, also, they're, giving, free, t]",know awesome ipad iphone app likely appreciate design also they're giving free t
2,@swonderlin Can not wait for #iPad 2 also. They should sale them down at #SXSW.,iPad,Positive,Apple,"[wait, ipad, 2, also, sale]",wait ipad 2 also sale
3,@sxsw I hope this year's festival isn't as crashy as this year's iPhone app. #sxsw,iPad or iPhone App,Negative,Apple,"[hope, year's, festival, crashy, year's, iphone, app]",hope year's festival crashy year's iphone app
4,"@sxtxstate great stuff on Fri #SXSW: Marissa Mayer (Google), Tim O'Reilly (tech books/conferences) &amp; Matt Mullenweg (Wordpress)",Google,Positive,Google,"[great, stuff, fri, marissa, mayer, google, tim, reilly, tech, book, conference, amp, matt, mullenweg, wordpress]",great stuff fri marissa mayer google tim reilly tech book conference amp matt mullenweg wordpress


## Perform a train-test-split

In [110]:
from sklearn.model_selection import train_test_split

In [114]:
print(df_products_cleaned['emotion'].unique())



['Negative' 'Positive']


In [115]:
target = df_products_cleaned['emotion']
text = df_products_cleaned['formatted_lemmed_text']

text_train, text_test, target_train, target_test = train_test_split(text, target, stratify=target, random_state=42)

In [122]:
target_train.value_counts(normalize=True)

Positive    0.837443
Negative    0.162557
Name: emotion, dtype: float64

In [123]:
target_test.value_counts(normalize=True)

Positive    0.837093
Negative    0.162907
Name: emotion, dtype: float64

### Create bag of words framework with CountVectorizer 

In [121]:
from sklearn.feature_extraction.text import CountVectorizer 

In [136]:
vectorizer = CountVectorizer(min_df=4)
X_train = vectorizer.fit_transform(text_train)

In [137]:
text_train_vec = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())

In [141]:
text_train_vec.head()

Unnamed: 0,000,10,11,13,15,150,1st,20,2011,24,...,yay,yeah,year,yelp,yes,yet,youtube,yr,zappos,zazzlesxsw
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
