# Data Preprocessing and data classification.

In [1]:
import json
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd

### pre-processing of tweets

In [2]:
def processTweet2(tweet):
  
    tweet = tweet.lower() #Convert to lower case
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet) #Convert www.* or https?://* to URL
    tweet = re.sub('@[^\s]+','AT_USER',tweet) #Convert @username to AT_USER
    tweet = re.sub('[\s]+', ' ', tweet) #Remove additional white spaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) #Replace #word with word
    tweet = tweet.strip('\'"') #trim
    
    return tweet

### look for 2 or more repetitions of character and replace with the character itself

In [4]:
def replaceTwoOrMore(s):
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)

### Filtering tweet words for feature vector

In [5]:
def getFeatureVector(tweet):
    featureVector = []
    words = tweet.split() # split tweet into words
    for w in words:
        w = replaceTwoOrMore(w) # replace two or more with two occurrences
        w = w.strip('\'"?,.') # strip punctuation
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w) # check if the word stats with an alphabet
        if (w in stopwords.words('german') or val is None): # ignore if it is a stop word
            continue
        else:
            featureVector.append(w.lower())
    return featureVector

### training with labeled data

In [6]:
tweettrain = pd.read_csv("classified_data.csv",encoding="ISO-8859-1")
tweets = []
featureList = []
for i in range(len(tweettrain)):
    sentiment = tweettrain['sentiment'][i]
    tweet = tweettrain['text'][i]
    processedTweet = processTweet2(tweet)
    featureVector = getFeatureVector(processedTweet)
    featureList.extend(featureVector)
    tweets.append((featureVector, sentiment))


In [7]:
print (featureList)

['theresamay', 'kurs', 'einwanderer', 'greatbritain', 'halifax', 'immigration', 'theresa', 'mays', 'vision', 'macht', 'angst', 'theresa', 'may', 'macht', 'ganz', 'neues', 'land', 'welt', 'theresa', 'may', 'kurs', 'einwanderer', 'url', 'may', 'tut', 'gb', 'merkel', 'regiert', 'volk', 'bordesholmer', 'baron', 'ralf', 'stegner', 'versteht', 'welt', 'afd', 'merkel', 'spd', 'stegner', 'url', 'trotz', 'deutschen', 'leben', 'url', 'bald', 'schlampe', 'url', 'zirkus', 'clubkinder', 'zirkus', 'schanze', 'hamburg', 'refugeecanteen', 'refugeeswelcome', 'url', 'merkel', 'macht', 'geht', 'volke', 'url', 'angela', 'merkel', 'machte', 'schlimmer', 'cdu', 'parallel', 'merkel', 'ganze', 'url', 'spd', 'schafft', 'mitte', 'via', 'besitze', 'deutsche', 'bundestag', 'gewaltverbot', 'vereinten', 'nationen', 'kriegshetzer', 'ard', 'zdf', 'unwort', 'url', 'merkel', 'propaganda', 'angela', 'merkel', 'beliebtesten', 'fdp', 'politbarometer', 'welt', 'url', 'url', 'gebete', 'kirche', 'bitte', 'cdu', 'csu', 'linke

In [8]:
print(tweets)

[(['theresamay', 'kurs', 'einwanderer', 'greatbritain', 'halifax', 'immigration'], 'neutral'), (['theresa', 'mays', 'vision', 'macht', 'angst'], 'negative'), (['theresa', 'may', 'macht', 'ganz', 'neues', 'land', 'welt'], 'neutral'), (['theresa', 'may', 'kurs', 'einwanderer', 'url', 'may', 'tut', 'gb', 'merkel', 'regiert', 'volk'], 'negative'), (['bordesholmer', 'baron', 'ralf', 'stegner', 'versteht', 'welt', 'afd', 'merkel', 'spd', 'stegner', 'url'], 'negative'), (['trotz', 'deutschen', 'leben', 'url'], 'negative'), (['bald', 'schlampe', 'url'], 'negative'), (['zirkus', 'clubkinder', 'zirkus', 'schanze', 'hamburg', 'refugeecanteen', 'refugeeswelcome', 'url'], 'neutral'), (['merkel', 'macht', 'geht', 'volke', 'url'], 'negative'), (['angela', 'merkel', 'machte', 'schlimmer', 'cdu'], 'negative'), (['parallel', 'merkel', 'ganze', 'url'], 'positive'), (['spd', 'schafft', 'mitte', 'via'], 'neutral'), (['besitze', 'deutsche', 'bundestag'], 'neutral'), (['gewaltverbot', 'vereinten', 'nationen'

### Method to extract feactures

In [9]:
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features

In [10]:
sample_tweet = "En gewoon op Merkel blijven stemmen"
print(extract_features(sample_tweet))

{'contains(theresamay)': False, 'contains(kurs)': False, 'contains(einwanderer)': False, 'contains(greatbritain)': False, 'contains(halifax)': False, 'contains(immigration)': False, 'contains(theresa)': False, 'contains(mays)': False, 'contains(vision)': False, 'contains(macht)': False, 'contains(angst)': False, 'contains(may)': False, 'contains(ganz)': False, 'contains(neues)': False, 'contains(land)': False, 'contains(welt)': False, 'contains(url)': False, 'contains(tut)': False, 'contains(gb)': False, 'contains(merkel)': False, 'contains(regiert)': False, 'contains(volk)': False, 'contains(bordesholmer)': False, 'contains(baron)': False, 'contains(ralf)': False, 'contains(stegner)': False, 'contains(versteht)': False, 'contains(afd)': False, 'contains(spd)': False, 'contains(trotz)': False, 'contains(deutschen)': False, 'contains(leben)': False, 'contains(bald)': False, 'contains(schlampe)': False, 'contains(zirkus)': False, 'contains(clubkinder)': False, 'contains(schanze)': False,

In [11]:
featureList = list(set(featureList)) # Remove featureList duplicates

In [12]:
training_set = nltk.classify.util.apply_features(extract_features, tweets)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)


In [13]:
tweettest = pd.read_csv("tweets_extracted.csv",encoding="ISO-8859-1")

for i in range(len(tweettest)):
    tweet1 = tweettest['text'][i]
    processedTestTweet = processTweet2(tweet1)
    sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet)))
    print(sentiment)


negative
negative
negative
positive
positive
negative
neutral
negative
positive
negative
neutral
positive
neutral
neutral
neutral
positive
negative
negative
neutral
negative
negative
negative
positive
positive
neutral
neutral
positive
positive
neutral
negative
positive
negative
neutral
neutral
neutral
negative
negative
negative
neutral
neutral
negative
negative
negative
neutral
positive
neutral
neutral
negative
negative
neutral
negative
negative
negative
negative
positive
neutral
positive
positive
neutral
positive
neutral
neutral
positive
negative
negative
neutral
neutral
negative
negative
neutral
negative
neutral
negative
negative
negative
negative
neutral
positive
neutral
negative
negative
negative
neutral
negative
negative
negative
neutral
positive
neutral
negative
negative
negative
neutral
positive
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
positive
negative
positive
neutral
negative
negative
neutral
negative
negative
negative
negative

neutral
neutral
negative
negative
negative
positive
negative
negative
negative
negative
positive
neutral
negative
positive
negative
negative
positive
negative
positive
negative
negative
negative
negative
negative
positive
negative
negative
negative
positive
negative
negative
positive
negative
positive
negative
negative
negative
negative
positive
negative
neutral
negative
neutral
negative
neutral
positive
negative
positive
negative
negative
negative
negative
neutral
negative
negative
negative
positive
neutral
negative
negative
negative
neutral
positive
negative
negative
negative
positive
negative
negative
negative
negative
negative
negative
negative
neutral
negative
negative
negative
positive
neutral
negative
negative
negative
negative
negative
negative
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative
negative
negative
neutral
neutral
negative
negative
negative
neutral
negative
positive
negative
negative
negative
negative
negative
positiv

negative
negative
negative
negative
negative
neutral
negative
positive
negative
positive
positive
negative
positive
neutral
negative
neutral
negative
negative
negative
positive
negative
negative
negative
negative
positive
negative
negative
negative
positive
positive
negative
positive
negative
negative
positive
positive
negative
negative
negative
negative
positive
positive
negative
negative
neutral
negative
negative
neutral
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative
negative
neutral
negative
neutral
negative
negative
negative
negative
neutral
negative
negative
negative
neutral
negative
positive
negative
negative
neutral
negative
negative
negative
neutral
negative
neutral
neutral
negative
positive
negative
negative
negative
negative
negative
neutral
neutral
negative
negative
negative
negative
negative
negative
positive
positive
negative
negative
negative
neutral
negative
negative
negative
negative
negative
negative
negative
negative


negative
negative
positive
positive
negative
negative
negative
negative
positive
negative
positive
negative
neutral
negative
negative
negative
negative
positive
negative
negative
negative
neutral
positive
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
negative
negative
negative
positive
negative
positive
negative
negative
negative
negative
negative
negative
positive
negative
negative
negative
neutral
negative
negative
negative
negative
negative
positive
negative
neutral
negative
negative
negative
negative
negative
positive
negative
negative
negative
neutral
neutral
neutral
positive
neutral
negative
negative
neutral
negative
negative
negative
negative
positive
neutral
positive
negative
negative
negative
negative
negative
positive
negative
negative
negative
negative
negative
positive
neutral
neutral
negative
neutral
positive
negative
negative
negative
neutral
negative
negative
negative
negative
negative
positive
negative
negative
negative
neutra

In [18]:
print (NBClassifier.show_most_informative_features(10))

Most Informative Features
         contains(trump) = True           neutra : positi =      4.6 : 1.0
        contains(berlin) = True           positi : negati =      4.6 : 1.0
   contains(deutschland) = True           positi : negati =      3.8 : 1.0
          contains(geht) = True           positi : negati =      3.8 : 1.0
          contains(seit) = True           positi : negati =      3.8 : 1.0
contains(tagderdeutscheneinheit) = True           neutra : positi =      3.2 : 1.0
         contains(heute) = True           neutra : negati =      3.1 : 1.0
     contains(bundestag) = True           neutra : negati =      3.1 : 1.0
         contains(macht) = True           negati : positi =      2.9 : 1.0
      contains(deutsche) = True           neutra : positi =      2.5 : 1.0
None


#### for Json format files

In [None]:
""""
with open('final.json', encoding="utf8") as f:
    for line in f:
        tweet = json.loads(line)
        text1 = tweet['text']
        processedTestTweet = processTweet2(text1)
        sentiment = NBClassifier.classify(extract_features(getFeatureVector(processedTestTweet)))
        print(sentiment)"""