In [1]:
#
# Start SparkNLP
#
from sparknlp.annotator import * 
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml import Pipeline
import sparknlp  
from pyspark.sql.functions import *
from nltk.corpus import stopwords
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.feature import IDF, CountVectorizer, HashingTF

import numpy as np
import os

%matplotlib inline
import matplotlib.pyplot as plt

# Start Spark NLP; the returned `spark` object is what the later cells use
# for every `spark.read` call.
spark = sparknlp.start()

In [29]:
#load data into spark
#using all the data is slow at my pc, so get just the first 100 000 tweets
! head -n10000000 training_set_tweets.txt > tweets.txt
tweets = spark.read.option("header", "false").option("sep", "\t").csv("tweets.txt")
#"/home/i/Downloads/twitter_cikm_2010/training_set_tweets.txt") 
tweets = tweets.withColumnRenamed("_c0", "user_id")
tweets = tweets.withColumnRenamed("_c1", "tweet_id")
tweets = tweets.withColumnRenamed("_c2", "tweet")
tweets = tweets.withColumnRenamed("_c3", "dt")
tweets = tweets.where("tweet is NOT NULL")

In [3]:
# Spark NLP annotators, chained by column name:
#   tweet -> document -> sentences -> tokens -> (normal, lemma)
docAsm = (
    DocumentAssembler()
    .setInputCol("tweet")
    .setOutputCol("document")
)
sentDet = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentences")
)
tokenizer = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("tokens")
)

# Both of the following read the raw "tokens" column; the lemmatizer does NOT
# consume the normalizer's "normal" output.
normalizer = (
    Normalizer()
    .setInputCols(["tokens"])
    .setOutputCol("normal")
)
# pretrained() fetches the default lemma model (see the download log below).
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(["tokens"])
    .setOutputCol("lemma")
)
    
    


lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [30]:
# Annotate the tweets: assemble the stages into one pipeline, fit it, and
# replace `tweets` with the annotated frame (adds the annotator output columns).
annotation_stages = [docAsm, sentDet, tokenizer, normalizer, lemmatizer]
pipeline = Pipeline().setStages(annotation_stages)
m = pipeline.fit(tweets)
tweets = m.transform(tweets)

In [34]:
# Twitter handles of the 100 most-followed performers, from
# https://www.theguardian.com/news/datablog/2013/apr/19/twitter-music-app-100-most-followed-musicians
top_twitter_performers = spark.read.csv("twitter.top100.handles.csv", header=True, sep=",")

# Lexicon of music artist names, from
# https://github.com/napsternxg/TwitterNER/tree/master/data/cleaned/custom_lexicons
# Only names containing a space (i.e. full names) are kept, because the list
# seems to contain many single-word entries that look like ordinary words.
music_artist_names = (
    spark.read.csv("music.artists.names.txt", header=True)
    .where(col("ArtistName").contains(" "))
)

# Words used to detect whether a tweet is about a concert.
concert_detection_words = spark.read.csv("concert.keywords.txt", header=True)

In [32]:
# Filter the annotated tweets down to those that are about a concert AND
# mention an artist.
#   1. Keep tweets whose lemmas contain a concert keyword. An inner join is
#      used because unmatched tweets are discarded anyway (the previous
#      left join + "ConcertKeyword IS NOT NULL" filter did the same thing).
#   2. Optionally attach an artist whose full name occurs in the tweet text.
#   3. Optionally attach a top performer whose handle is one of the tokens.
#      (Handle / TopPerformerName are expected to come from the CSV header of
#      top_twitter_performers.)
#   4. Artist = ArtistName if present, otherwise TopPerformerName; tweets with
#      neither are dropped.
# A tweet matching several keywords or artists yields one row per match.
rel_tweets = (
    tweets
    .join(concert_detection_words, expr("array_contains(lemma.result, ConcertKeyword)"), "inner")
    .join(music_artist_names, tweets.tweet.contains(music_artist_names.ArtistName), "left")
    .join(top_twitter_performers, expr("array_contains(tokens.result, Handle)"), "left")
    .withColumn("Artist", expr("coalesce(ArtistName, TopPerformerName)"))
    .where("(Artist IS NOT NULL)")
)



In [33]:
# Persist the filtered tweets, replacing any previous output. The format is
# stated explicitly so the data always matches the ".parquet" path, instead of
# depending on the session's default data source setting.
rel_tweets.write.mode("overwrite").parquet("rel.tweets.parquet")