In [1]:
import os 
import dotenv
from pathlib import Path

In [2]:
# Project root: the directory two levels above the current working directory.
project_dir = Path.cwd().resolve().parents[1]
project_dir

PosixPath('/Users/kinara/Scotland/Talks/AllDataId/ml-in-prod')

In [3]:
import pandas as pd
import numpy as np

In [4]:
# read raw twitter dataset

In [5]:
# Load the raw tweets CSV from <project_dir>/data/raw.
raw_df = pd.read_csv(project_dir / "data" / "raw" / "Tweets.csv")

In [6]:
raw_df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [7]:
raw_df.shape

(14640, 15)

In [8]:
# remove duplicates 

In [9]:
# Drop rows that are exact duplicates across all columns, keeping the first occurrence.
raw_df = raw_df.drop_duplicates(keep="first")

In [10]:
raw_df.shape

(14604, 15)

In [11]:
# Keep only the columns used downstream. The explicit .copy() gives raw_df its
# own data, so the column assignments in the following cells
# (raw_df["text"] = ...) do not trigger pandas' SettingWithCopyWarning.
raw_df = raw_df[["tweet_id", "airline_sentiment", "negativereason", "airline", "text"]].copy()

In [12]:
raw_df.head()

Unnamed: 0,tweet_id,airline_sentiment,negativereason,airline,text
0,570306133677760513,neutral,,Virgin America,@VirginAmerica What @dhepburn said.
1,570301130888122368,positive,,Virgin America,@VirginAmerica plus you've added commercials t...
2,570301083672813571,neutral,,Virgin America,@VirginAmerica I didn't today... Must mean I n...
3,570301031407624196,negative,Bad Flight,Virgin America,@VirginAmerica it's really aggressive to blast...
4,570300817074462722,negative,Can't Tell,Virgin America,@VirginAmerica and it's a really big bad thing...


In [13]:
# preprocessing text feature

In [14]:
# Strip the airline Twitter handles from the tweet text (exact, case-sensitive
# matches only), one handle after another in a single chained expression.
raw_df["text"] = (
    raw_df["text"]
    .str.replace("@VirginAmerica", "")
    .str.replace("@united", "")
    .str.replace("@SouthwestAir", "")
    .str.replace("@JetBlue", "")
    .str.replace("@AmericanAir", "")
    .str.replace("@USAirways", "")
)

In [15]:
# expand contractions (short forms such as "can't" -> "can not")

In [16]:
import re
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are expanded first, in any
    letter case; an upper-case first letter is carried over to the expansion
    ("Won't" -> "Will not"). The generic suffixes (n't, 're, 's, 'd, 'll, 't,
    've, 'm) are then expanded in that order.

    The typographic apostrophe (U+2019) is folded into the ASCII apostrophe
    before matching, so "don\u2019t" is expanded like "don't". As a side
    effect, any remaining U+2019 in the text is returned as "'".

    Note that "'s" is always expanded to " is", including possessives.

    Parameters
    ----------
    phrase : str
        Text to process.

    Returns
    -------
    str
        The text with contractions expanded.
    """

    def _match_case(expansion):
        # Build a re.sub callback that upper-cases the first letter of the
        # expansion when the matched contraction started with a capital.
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion[0].upper() + expansion[1:]
            return expansion

        return _replace

    # Fold the typographic apostrophe so the patterns below also match it.
    phrase = phrase.replace("\u2019", "'")

    # specific: irregular stems must run before the generic "n't" rule,
    # otherwise "won't" would become "wo not" and "can't" "ca not".
    phrase = re.sub(r"won't", _match_case("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can't", _match_case("can not"), phrase, flags=re.IGNORECASE)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [17]:
# Expand contractions in every tweet.
raw_df["text"] = raw_df["text"].map(decontracted)

In [18]:
def preprocess_text(sent):
    """Reduce ``sent`` to ASCII letters and digits separated by single spaces.

    A few literal character sequences are blanked out first, then every run of
    characters outside ``A-Za-z0-9`` is collapsed into one space. The result
    is not stripped, so it may start or end with a space.
    """
    # These are literal sequences (a backslash followed by r, n or a double
    # quote), not control characters.
    # NOTE(review): 'http"' only matches the text http immediately followed by
    # a double quote -- confirm whether URL removal was intended instead.
    for marker in ('\\r', '\\n', '\\"', 'http"'):
        sent = sent.replace(marker, ' ')
    return re.sub('[^A-Za-z0-9]+', ' ', sent)
    

In [19]:
# Clean every tweet down to alphanumeric tokens.
raw_df["text"] = raw_df["text"].map(preprocess_text)

In [20]:
# Upper-case the sentiment labels.
raw_df = raw_df.assign(airline_sentiment=raw_df["airline_sentiment"].str.upper())

In [21]:
# Set the display column-width option to 0 so long tweet text is shown in full.
pd.options.display.max_colwidth = 0

In [22]:
raw_df.head()

Unnamed: 0,tweet_id,airline_sentiment,negativereason,airline,text
0,570306133677760513,NEUTRAL,,Virgin America,What dhepburn said
1,570301130888122368,POSITIVE,,Virgin America,plus you have added commercials to the experience tacky
2,570301083672813571,NEUTRAL,,Virgin America,I did not today Must mean I need to take another trip
3,570301031407624196,NEGATIVE,Bad Flight,Virgin America,it is really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse
4,570300817074462722,NEGATIVE,Can't Tell,Virgin America,and it is a really big bad thing about it


In [23]:
raw_df.shape

(14604, 5)

In [None]:
# draw a 30% sample, stratified by airline_sentiment and airline

In [31]:
# Stratified 30% sample: the same fraction is drawn from every
# (airline_sentiment, airline) group. The fixed random_state makes the sample,
# and therefore the exported dataset, reproducible across kernel restarts.
sample_df = raw_df.groupby(["airline_sentiment", "airline"], group_keys=False).apply(
    lambda group: group.sample(frac=0.3, random_state=42)
)

In [32]:
sample_df.shape

(4380, 5)

In [33]:
sample_df.head()

Unnamed: 0,tweet_id,airline_sentiment,negativereason,airline,text
12352,570222239603273729,NEGATIVE,Can't Tell,American,do not merge with an airline that ai not ready for prime time and book your elite flyers on it NotHappy NeedCoffee
12378,570213186139525120,NEGATIVE,Flight Attendant Complaints,American,thanks for making the worst fly experience ever Will never book again with your airline Train your flight attendants better
13439,569864610016321536,NEGATIVE,Customer Service Issue,American,Filed a PIR with an agent at Indianapolis airport Unable to pull up information online on your site Have DMed details
13492,569850083140882432,NEGATIVE,Lost Luggage,American,I am called Paris office this morning again still waiting It is in Miami but apparently tag was taken off
13243,569907965223763970,NEGATIVE,Customer Service Issue,American,thx for showing me that your Twitter appreciates me more than your employees Sure another airline would like my 1300 nohotel


In [34]:
# Write the sampled dataset to <project_dir>/data/processed. The directory is
# created first because to_csv does not create missing parent folders.
processed_dir = project_dir / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)
sample_df.to_csv(processed_dir / "airline-sentiment-datasets.csv", index=False)