In [33]:
#!pip install findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF,IDF,Tokenizer,StopWordsRemover
from pyspark.ml.classification import LinearSVC
from pyspark.sql import functions as F

In [4]:
import pyspark as ps
from pyspark.sql import SQLContext
#import warnings
#try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
 #   sc = ps.SparkContext('local[*]')
  #  sqlContext = SQLContext(sc)
   # print("Just created a SparkContext")
##except ValueError:
    warnings.warn("SparkContext already exists in this scope")



Just created a SparkContext


In [47]:
#create or get Spark Session
#spark = sparknlp.start()
spark = SparkSession.builder \
    .appName("spar")\
    .master("spark://spark:7077")\
    .config("spark.driver.memory","8G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.kryoserializer.buffer.max", "500m")\
    .getOrCreate()
print("sparkversion", spark.version)

sparkversion 3.2.3


### Tokenizer

Il transforme les phrases du document en mots

In [48]:
tokenizer = Tokenizer(inputCol="tweet", outputCol="token")

### Stop Words Remover

Stop Words comme I, we, me, et ainsi de suite ont été supprimés car ils n’affectent pas le sens du mot.
Il supprime la ponctuation et transforme les mots en minuscules. Ensuite, transformez ces mots propres documents en mots réguliers.

In [49]:
swr = StopWordsRemover(inputCol="token",outputCol="token_features")

### Hashing

Il transforme les mots en valeurs numériques, ce processus est appelé vectorisation.

In [50]:
hashingTF = HashingTF(inputCol="token_features", outputCol="rawFeatures")# To generate Inverse Document Frequency

In [51]:
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5)

### Support Vector machine (SVM)

C’est un algorithme d’apprentissage automatique bien connu utilisé dans la classification. Il est utilisé pour mapper les caractéristiques numériques au sentiment.

In [52]:
SVC = LinearSVC(labelCol = 'label',featuresCol="features",maxIter=60)

Now, we are ready to make a full pipeline…

In [53]:
from pyspark.ml import Pipeline
pipeline = Pipeline().setStages([tokenizer,swr,hashingTF,idf,SVC])

In [54]:
training_data = spark.read.csv('train.csv',inferSchema=True,header=True)

In [55]:
training_data.show()

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
|  6|    0|[2/2] huge fan fa...|
|  7|    0| @user camping to...|
|  8|    0|the next school y...|
|  9|    0|we won!!! love th...|
| 10|    0| @user @user welc...|
| 11|    0| â #ireland con...|
| 12|    0|we are so selfish...|
| 13|    0|i get to see my d...|
| 14|    1|@user #cnn calls ...|
| 15|    1|no comment!  in #...|
| 16|    0|ouch...junior is ...|
| 17|    0|i am thankful for...|
| 18|    1|retweet if you ag...|
| 19|    0|its #friday! ð...|
| 20|    0|as we all know, e...|
+---+-----+--------------------+
only showing top 20 rows



In [56]:
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet',r'http\S+',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','@\w+',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','#',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','RT',''))

training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&lt',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&gt',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&amp',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','&quot',''))

training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','-',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet',' ',''))
training_data=training_data.withColumn('tweet',F.regexp_replace('tweet','  ',''))

In [57]:
training_data.show(4)

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0|whenafatherisdysf...|
|  2|    0|thanksforlyftcred...|
|  3|    0|   bihdayyourmajesty|
|  4|    0|modeliloveutakewi...|
+---+-----+--------------------+
only showing top 4 rows



In [58]:
processed_tweets = pipeline.fit(training_data)

In [59]:
processed_tweets

PipelineModel_785686941cbd

In [60]:
from elasticsearch import Elasticsearch

In [61]:
es = Elasticsearch("http://localhost:9200")
es.info(pretty=True)

  es.info(pretty=True)


ObjectApiResponse({'name': 'ab62d2783536', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'XyzgtYCEQ9m5rf9Vrgywmg', 'version': {'number': '7.17.7', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '78dcaaa8cee33438b91eca7f5c7f56a70fec9e80', 'build_date': '2022-10-17T15:29:54.167373105Z', 'build_snapshot': False, 'lucene_version': '8.11.1', 'minimum_wire_compatibility_version': '6.8.0', 'minimum_index_compatibility_version': '6.0.0-beta1'}, 'tagline': 'You Know, for Search'})

In [62]:
from kafka import KafkaConsumer
#import pydoop.hdfs as hdfs
consumer = KafkaConsumer('covid',bootstrap_servers=['localhost:9092'])

import pandas as pd
columns = ['id','tweet', 'label']

for message in consumer:
   #print(i) 
   values = message.value.decode('utf-8').splitlines()
   dfObj = pd.DataFrame(values,columns = ['tweet']) 
   dfObj.reset_index(drop=True) 
   dfObj=spark.createDataFrame(dfObj) 
    
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet',r'http\S+',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','@\w+',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','#',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','RT',''))

   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&lt',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&gt',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&amp',''))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','&quot',''))

   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','-',' '))
   dfObj=dfObj.withColumn('tweet',F.regexp_replace('tweet','  ',' '))
   #print(dfObj.select('tweet').show())
   result=processed_tweets.transform(dfObj)
   t=result.select('tweet').collect()
   p=result.select('prediction').collect()
   print(result.select(['tweet','prediction']).show())
   ##################################################
    ##################################################
   #results = processed_tweets.transform(test_data)
   from datetime import datetime
   # Getting the current date and time
   dt = datetime.now()
   # getting the timestamp
   ts = datetime.timestamp(dt) 
    # write data
   es.index(
         index='tweets_mondial',
         document={
          'tweet': t[0]['tweet'],
          'prediction': p[0]['prediction'],
          'Timestamp' :ts
 })
  
    



+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : They\u2019ve ...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : Honble sir pl...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : COVID is not ...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : Crematoriums ...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" Agree with taxi...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" They are in the...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : If you were O...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|"WASHINGTON (AP) ...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : Published in ...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : Please retwee...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : Funny how, wh...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|"China didn\u2019...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : Just saw an a...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : Breaking: Isr...|       0.0|
+--------------------+----------+

None


  es.index(


+--------------------+----------+
|               tweet|prediction|
+--------------------+----------+
|" : \ud83d\udd34 ...|       0.0|
+--------------------+----------+

None


  es.index(
ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "C:\Apache-spark\spark-3.2.3-bin-hadoop2.7\python\lib\py4j-0.10.9.5-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "C:\Apache-spark\spark-3.2.3-bin-hadoop2.7\python\lib\py4j-0.10.9.5-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "C:\Users\pc\anaconda3\lib\socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 