IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd

READ DATA

In [2]:
# Fixed seed so the shuffle below (and everything derived from the row order)
# is identical on every Restart & Run All.
SEED = 42

data = pd.read_csv("tweets.csv")
# sample(frac=1) returns every row in random order; reset_index(drop=True)
# replaces the shuffled index with a fresh 0..n-1 range.
data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,568496673237929984,negative,0.6906,Customer Service Issue,0.3457,United,,skyjumper77,,0,@united or did you mean 24-26?,,2015-02-19 11:45:43 -0800,,
1,568848928466198530,positive,1.0,,,United,,MarthaH65165635,,0,@united since when did you get so good again! ...,,2015-02-20 11:05:27 -0800,,
2,570304178314190848,negative,1.0,Bad Flight,0.6784,Southwest,,tonybrancato,,0,@SouthwestAir my wife had been in group A in p...,"[41.1974934, -73.76920486]",2015-02-24 11:28:06 -0800,Chappaqua NY,Eastern Time (US & Canada)
3,568798264943902720,neutral,1.0,,,Delta,,Rachel_Lipson,,0,@JetBlue should I be nervous about a Sunday AM...,"[38.91355311, -77.04169657]",2015-02-20 07:44:08 -0800,"Washington, DC",Eastern Time (US & Canada)
4,569175432702107648,positive,1.0,,,Southwest,,vscof,,0,"@SouthwestAir flight 3970, bna-rdu had the mos...",,2015-02-21 08:42:52 -0800,"Nashville, TN",


EXTRACTING LABELS AND TWEETS

In [3]:
# Pull the tweet text and its sentiment label out of the frame as two
# NumPy arrays that share the same row order.
reviews, labels = (
    np.array(data[column]) for column in ('text', 'airline_sentiment')
)
reviews

array(['@united or did you mean 24-26?',
       '@united since when did you get so good again!  4 transcontinental flights in 72h.  What a pleasure/relief you were.',
       '@SouthwestAir my wife had been in group A in prev. flight but got bumped for some reason.  Alone with two kids. At least put her in group A',
       ...,
       '@VirginAmerica @ChrysiChrysic your assistance yesterday when u Cancelled Flightled our flight was to give us a hotel hotline  Shame on you!',
       "@SouthwestAir Received email w/reso's to my email that isn't associated to SWA account I own with CC info for my sister. #FRAUD please DM me",
       '@USAirways do you know what flight the pilots for 1581 are coming in on? Thanks.'],
      dtype=object)

In [4]:
# Display the sentiment label array built from data['airline_sentiment'].
labels

array(['negative', 'positive', 'negative', ..., 'negative', 'negative',
       'neutral'], dtype=object)

REMOVING PUNCTUATION, HYPERLINKS, AND NUMBERS

In [5]:
from string import punctuation


def clean(words):
    """Filter and normalise a list of tokens.

    A token is dropped entirely when it is empty, starts with an ASCII
    punctuation character (e.g. ``@mention`` or ``#hashtag``), contains the
    substring ``http``, or contains any digit. Every surviving token has all
    ASCII punctuation removed and is lower-cased.

    Parameters
    ----------
    words : iterable of str
        Tokens to clean.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Build the deletion table once instead of calling str.replace for each
    # punctuation character on every word.
    strip_punct = str.maketrans('', '', punctuation)

    nword = []
    for word in words:
        # Empty strings are skipped up front: word[0] would raise IndexError.
        if not word or word[0] in punctuation:
            continue
        # The 'http' check is case-sensitive and runs before lower-casing.
        if 'http' in word or any(char.isdigit() for char in word):
            continue

        word = word.translate(strip_punct).lower()
        # Remove a stand-alone "s" surrounded by spaces. This only has an
        # effect when a token itself contains whitespace.
        while " s " in word:
            word = word.replace(" s ", " ")

        nword.append(word)

    return nword

LOAD SPACY'S ENGLISH MODEL

In [6]:
import spacy
# Ask spaCy to use a GPU if one is available. Kept ahead of spacy.load on purpose.
# NOTE(review): spaCy's docs advise calling this before loading any pipeline — confirm for the installed version.
spacy.prefer_gpu()
# Load the "en_core_web_sm" pipeline; `nlp` is the callable used to process text.
nlp = spacy.load("en_core_web_sm")

FUNCTION FOR LEMMATIZATION

In [7]:
def lemma(sentence):
    """Return `sentence` with stop words dropped and the rest lemmatised.

    The text is processed by the module-level spaCy pipeline `nlp`; the
    lemmas of all tokens not flagged as stop words are joined with single
    spaces and returned as one string.
    """
    kept_lemmas = []
    for token in nlp(sentence):
        if not token.is_stop:
            kept_lemmas.append(token.lemma_)
    return " ".join(kept_lemmas)

LEMMATIZING AND CLEANING 

In [8]:

# For every tweet: lemmatise and drop stop words, split the result on
# whitespace, then clean the tokens. The iteration variable is named `tweet`
# (not `re`) so the name of the standard-library regex module is never
# rebound, and the comprehension keeps it out of the global namespace.
revs = [clean(lemma(tweet).split()) for tweet in reviews]
revs

[['mean'],
 ['good', 'transcontinental', 'flight', 'pleasure', 'relief'],
 ['wife', 'group', 'prev', 'flight', 'get', 'bump', 'reason', 'kid', 'group'],
 ['nervous',
  'sunday',
  'flight',
  'baltimore',
  'boston',
  'suggestion',
  'need',
  'boston',
  'monday'],
 ['flight', 'bna', 'rdu', 'excellent', 'crew', 'today'],
 ['discount', 'volunteer', 'firefighter'],
 ['absolutely',
  'horrible',
  'customer',
  'service',
  'person',
  'suppose',
  'immediately',
  'disconnect'],
 ['flyingitforward',
  'fly',
  'port',
  'au',
  'prince',
  'haiti',
  'help',
  'go',
  'earthquake',
  'relief',
  'effort',
  'lot'],
 ['far', 'good', 'step', 'denver', 'stop', 'portland'],
 ['week',
  'take',
  'money',
  'leave',
  'strand',
  'hour',
  'home',
  'rent',
  'car',
  'help'],
 ['love', 'capt', 'joe', 'flight', 'bos', 'sfo', 'fun'],
 ['seriously', 'doubt', 'sit', 'inside', 'gate'],
 ['inconvenience', 'disaster', 'phone', 'bill', 'refuse', 'pay'],
 ['flight', 'arrive', 'follow', 'flight'],
 

making a list of all words

In [9]:
# Flatten the per-review token lists into a single list (duplicates kept,
# original order preserved).
all_words = [token for tokens in revs for token in tokens]

print(all_words)



**tokenizing unique words**


In [10]:
from collections import Counter

# How often each token occurs across all reviews.
counts = Counter(all_words)

# Vocabulary ordered from most to least frequent; equal counts keep the order
# in which the words were first seen.
sorted_counts = [word for word, _ in counts.most_common()]
# Ids start at 1, so 0 never refers to a real word.
word_to_int = dict(zip(sorted_counts, range(1, len(sorted_counts) + 1)))

sz = len(word_to_int)

# Replace every token of every review with its integer id.
coded_revs = [[word_to_int[word] for word in review] for review in revs]



In [11]:
# Vocabulary size: the number of entries in word_to_int.
print(sz)

9481


ENCODING LABELS

In [12]:
# One-hot targets in the column order [negative, neutral, positive].
# Any label other than 'positive' or 'neutral' is encoded as negative.
_one_hot = {'positive': (0, 0, 1), 'neutral': (0, 1, 0)}
# list(...) gives every review its own fresh 3-element list.
classes = [list(_one_hot.get(lab, (1, 0, 0))) for lab in labels]

ZERO-LENGTH COUNT AND MAXIMUM LENGTH OF REVIEWS

In [13]:
# Histogram of encoded review lengths: length -> number of reviews.
review_lens = Counter(map(len, coded_revs))
print("Zero-length reviews: {}".format(review_lens[0]))
# max() over a Counter iterates its keys, i.e. the lengths themselves.
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 70
Maximum review length: 43


REMOVING REVIEWS WITH 0 LENGTH

In [14]:
print('Number of reviews before removing outliers: ', len(coded_revs))

# Positions of the reviews that still contain at least one word id.
non_zero_idx = [pos for pos, review in enumerate(coded_revs) if len(review) > 0]

# Keep only those reviews together with the matching label rows, so the two
# collections stay aligned index-for-index.
kept_revs = []
kept_classes = []
for pos in non_zero_idx:
    kept_revs.append(coded_revs[pos])
    kept_classes.append(classes[pos])

coded_revs = kept_revs
classes = np.array(kept_classes)

print('Number of reviews after removing outliers: ', len(coded_revs))

Number of reviews before removing outliers:  14640
Number of reviews after removing outliers:  14570


FUNCTION TO PAD REVIEWS (zeros on the left, words right-aligned)

In [15]:
def pad_features(reviews_ints, seq_length):
    """Return a 2-D integer array with one fixed-width row per review.

    Reviews shorter than `seq_length` are left-padded with 0's, so their
    word ids sit at the right-hand end of the row. Reviews longer than
    `seq_length` are truncated to their first `seq_length` ids. An empty
    review yields a row of zeros.

    Parameters
    ----------
    reviews_ints : sequence of sequences of int
        Integer-encoded reviews.
    seq_length : int
        Number of columns in the output.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(reviews_ints), seq_length)`` with dtype int.
    """
    # Start from all zeros so only the real word ids need to be written.
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    for i, row in enumerate(reviews_ints):
        kept = np.asarray(row)[:seq_length]
        # Skip empty rows: a slice starting at -0 would select the whole row
        # and the empty array could not be broadcast into it.
        if kept.size:
            features[i, -kept.size:] = kept

    return features

In [16]:
# Every review is forced to exactly this many word ids: shorter reviews are
# zero-padded and longer ones truncated by pad_features.
seq_length = 15

features = pad_features(coded_revs, seq_length=seq_length)

Saving our features and classes data

In [17]:
from numpy import savetxt
# Write the padded feature matrix and the one-hot label matrix as
# comma-separated text files in the current working directory.
# NOTE(review): no `fmt` is passed, so NumPy's default float formatting is
# used even though both arrays hold integers — confirm the downstream reader
# expects that before changing it.
savetxt('features.csv',features,delimiter=',')
savetxt('classes.csv',classes,delimiter=',')