In [None]:
# Install the pinned PySpark and Spark NLP releases this notebook runs on (-q quiets pip's output).
!pip install -q pyspark==3.3.0 spark-nlp==4.2.0

In [None]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
import pandas as pd

# Start the Spark NLP session once and keep the handle in `spark`.
# (The original issued a second, bare `sparknlp.start(gpu=True)` whose return
# value was discarded; that redundant call is removed.)
spark = sparknlp.start(gpu=True)

# Report the library versions in use so recorded results can be traced to them.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

In [None]:
import io

from google.colab import files

# Ask for a file upload in Colab; the returned mapping is indexed by file name below.
data_to_load = files.upload()

# Parse the uploaded bytes of 'data_news.csv' into a pandas DataFrame and preview it.
data = pd.read_csv(
    io.BytesIO(data_to_load['data_news.csv'])
)
data.head()

Saving data_news.csv to data_news.csv


Unnamed: 0,category,article_content
0,Economy,Mongolian tent-dwellers face growing hardship ...
1,Economy,Beijing to roll out subsidies and tax credits ...
2,Economy,Prosecutors said he defrauded customers by mis...
3,Economy,The world’s biggest cryptocurrency exchange sa...
4,Economy,Consumer prices saw the slowest increase in 15...


In [None]:
import pyspark.sql.functions as F

# Load the CSV into a Spark DataFrame, taking column names from the first row.
df = (
    spark.read
    .option("header", "true")
    .csv("data_news.csv")
)

# Preview the first 10 rows, cutting long cell values at 50 characters.
df.show(10, truncate=50)

+--------+--------------------------------------------------+
|category|                                   article_content|
+--------+--------------------------------------------------+
| Economy|Mongolian tent-dwellers face growing hardship a...|
| Economy|Beijing to roll out subsidies and tax credits t...|
| Economy|Prosecutors said he defrauded customers by misa...|
| Economy|The world’s biggest cryptocurrency exchange sai...|
| Economy|Consumer prices saw the slowest increase in 15 ...|
| Economy|The biggest rail union start national strike ov...|
| Economy|Customer assets at FTX were commingled with tho...|
| Economy|IMF board approval of the proposed three-year l...|
| Economy|Airbnb units in the city’s central areas can li...|
| Economy|Some fear the Alternative for Germany party cou...|
+--------+--------------------------------------------------+
only showing top 10 rows



In [None]:
from pyspark.sql.functions import col

# Articles per category over the whole dataset, most frequent first.
(
    df.groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)


+--------------------+-----+
|            category|count|
+--------------------+-----+
|             Economy| 3032|
|Science & Technology| 2973|
|              Sports| 2921|
+--------------------+-----+



In [None]:
# Split the dataset into training and test parts with weights 0.9 / 0.1.
# The fixed seed is passed so the split can be repeated.
trainingData, testData = df.randomSplit([0.9, 0.1], seed=100)

print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 8039
Test Dataset Count: 887


In [None]:
from pyspark.sql.functions import col

# Articles per category in the training split, most frequent first.
(
    trainingData.groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)


+--------------------+-----+
|            category|count|
+--------------------+-----+
|             Economy| 2749|
|Science & Technology| 2680|
|              Sports| 2610|
+--------------------+-----+



In [None]:
# Articles per category in the test split, most frequent first.
(
    testData.groupBy("category")
    .count()
    .orderBy(col("count").desc())
    .show()
)

+--------------------+-----+
|            category|count|
+--------------------+-----+
|              Sports|  311|
|Science & Technology|  293|
|             Economy|  283|
+--------------------+-----+



**Model 1: BERT CMLM sentence embeddings + ClassifierDL (accuracy ≈ 90%)**

In [None]:
# Pipeline stages: raw article text -> document annotation -> sentence embedding -> classifier.

# Wrap the "article_content" column into the "document" annotation column.
document = (
    DocumentAssembler()
    .setInputCol("article_content")
    .setOutputCol("document")
)

# Pretrained English sentence encoder ('sent_bert_use_cmlm_en_base').
bert_cmlm = (
    BertSentenceEmbeddings.pretrained('sent_bert_use_cmlm_en_base', 'en')
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Trainable classifier: labels come from "category"; 10 epochs, learning rate 0.001,
# with training output logs enabled.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(10)
    .setEnableOutputLogs(True)
    .setLr(0.001)
)

bert_cmlm_clf_pipeline = Pipeline(stages=[document, bert_cmlm, classsifierdl])

sent_bert_use_cmlm_en_base download started this may take some time.
Approximate size to download 391.6 MB
[OK!]


In [None]:
# Train the BERT CMLM classification pipeline on the training split.
# NOTE(review): the variable is called `Model2` although the section heading labels this "model 1";
# the name is kept because later cells reference it.
Model2 = bert_cmlm_clf_pipeline.fit(trainingData)

In [None]:
# Apply the fitted pipeline to the held-out test split to obtain predictions.
preds2 = Model2.transform(testData)

In [None]:
from sklearn.metrics import classification_report

# Bring the true label, the article text and the predicted-label array into pandas.
preds = preds2.select('category', 'article_content', "class.result").toPandas()

# Each prediction arrives as a sequence; keep its first element as the predicted label.
preds['result'] = preds['result'].apply(lambda labels: labels[0])

# Per-class precision / recall / F1 of predictions against the true categories.
print(classification_report(preds['category'], preds['result']))

                      precision    recall  f1-score   support

             Economy       0.85      0.85      0.85       283
Science & Technology       0.87      0.85      0.86       293
              Sports       0.97      1.00      0.98       311

            accuracy                           0.90       887
           macro avg       0.90      0.90      0.90       887
        weighted avg       0.90      0.90      0.90       887



In [None]:
from google.colab import drive

# Mount Google Drive before anything is written under /content/drive/MyDrive.
# Without a mount, that path is just a directory on the Colab VM and the saved
# model would be lost when the runtime is recycled.
drive.mount('/content/drive')


In [None]:
# Persist the fitted pipeline under the Drive path. Going through write().overwrite()
# lets the cell be re-run: a plain save() raises when the target path already exists.
# NOTE(review): the directory is named 'use_clf' although `Model2` was fitted from the
# BERT CMLM pipeline — confirm the intended name.
Model2.write().overwrite().save('/content/drive/MyDrive/use_clf')

In [None]:
# Persist the fitted pipeline to the working directory. write().overwrite() makes the
# cell re-runnable: a plain save() raises when 'my_model' already exists.
Model2.write().overwrite().save('my_model')

In [None]:
# Display the fitted pipeline's stages in order; the index of the classifier stage is needed below.
Model2.stages

[DocumentAssembler_61ee157bf1ec,
 BERT_SENTENCE_EMBEDDINGS_561b5fbb81d8,
 ClassifierDLModel_33ca83ea43b9]

In [None]:
# Save only the stage at index 2 of the fitted pipeline (the ClassifierDLModel in the stage listing),
# overwriting any previous copy at this path.
Model2.stages[2].write().overwrite().save('MultilabelClfBert')

In [None]:
# Reload the classifier stage that was saved under 'MultilabelClfBert'.
# NOTE(review): this rebinds the name `ClassifierDLModel` (presumably the class brought in by the
# star imports) to the loaded instance. The next cell passes this name as a pipeline stage, so it
# cannot be renamed here in isolation — consider a distinct variable name in both cells.
ClassifierDLModel = ClassifierDLModel.load('MultilabelClfBert')

In [None]:
# Build the prediction pipeline from the existing document and embedding stages
# plus the reloaded classifier.
ld_pipeline = Pipeline(
    stages=[
        document,
        bert_cmlm,
        ClassifierDLModel,
    ]
)

# `fit` is called on a one-row placeholder DataFrame (a single empty string in column "text").
ld_pipeline_model = ld_pipeline.fit(
    spark.createDataFrame([['']]).toDF("text")
)

In [None]:
# Run the reloaded prediction pipeline over the test split.
ld_preds = ld_pipeline_model.transform(testData)

In [None]:
# Collect the true label, the article text and the predicted-label array into a pandas DataFrame.
ld_preds_df = ld_preds.select('category','article_content',"class.result").toPandas()

In [None]:
# Preview the first 10 predictions next to their true categories.
ld_preds_df.head(10)

Unnamed: 0,category,article_content,result
0,Economy,$52bn bipartisan deal would subsidise US semic...,[Economy]
1,Economy,"A ban is in the works but the question is, how...",[Economy]
2,Economy,A donor conference aims to help Moldova deal w...,[Economy]
3,Economy,A new joint task force to work on slashing the...,[Economy]
4,Economy,A new report says most exports went to Europea...,[Economy]
5,Economy,A number of countries have pledged $8.5bn and ...,[Science & Technology]
6,Economy,A surging US dollar is making local currencies...,[Economy]
7,Economy,A worldwide survey from payroll provider ADP f...,[Science & Technology]
8,Economy,"Adoption rates of the eNaira, Africa’s first o...",[Science & Technology]
9,Economy,"After Ukrainian leader’s speech, US President ...",[Economy]


**Model 2: Universal Sentence Encoder + ClassifierDL (accuracy ≈ 89%)**

In [None]:
# The article text is read from the "article_content" column.
document = (
    DocumentAssembler()
    .setInputCol("article_content")
    .setOutputCol("document")
)

# Default pretrained Universal Sentence Encoder, producing one embedding per document.
use = (
    UniversalSentenceEncoder.pretrained()
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Trainable classifier: labels come from the "category" column; 5 epochs,
# with training output logs enabled.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(5)
    .setEnableOutputLogs(True)
)

use_clf_pipeline = Pipeline(stages=[document, use, classsifierdl])

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
# Train the Universal Sentence Encoder classification pipeline on the training split.
Model = use_clf_pipeline.fit(trainingData)

In [None]:
# Apply the fitted USE pipeline to the held-out test split.
preds = Model.transform(testData)

In [None]:
# Show the first 10 test rows: true category, article text (cut at 80 characters), predicted label array.
preds.select('category','article_content',"class.result").show(10, truncate=80)

+--------+--------------------------------------------------------------------------------+----------------------+
|category|                                                                 article_content|                result|
+--------+--------------------------------------------------------------------------------+----------------------+
| Economy|$52bn bipartisan deal would subsidise US semiconductor manufacturing to boost...|[Science & Technology]|
| Economy|A ban is in the works but the question is, how effective will it be in limiti...|             [Economy]|
| Economy|A donor conference aims to help Moldova deal with the impact of the Ukraine w...|             [Economy]|
| Economy|A new joint task force to work on slashing the bloc’s dependence on Russian f...|             [Economy]|
| Economy|A new report says most exports went to European countries as Kyiv urges the W...|             [Economy]|
| Economy|A number of countries have pledged $8.5bn and technical assistance to 

In [None]:
from sklearn.metrics import classification_report

# Build the pandas frame from `Model.transform(testData)` instead of from `preds` itself.
# The original `preds = preds.select(...).toPandas()` replaced the Spark DataFrame with a
# pandas one, so re-running this cell failed (a pandas DataFrame has no `.select`).
# After the cell `preds` is still the pandas frame, exactly as before.
preds = (
    Model.transform(testData)
    .select('category', 'article_content', "class.result")
    .toPandas()
)

# Each prediction arrives as a sequence; keep its first element as the predicted label.
preds['result'] = preds['result'].apply(lambda labels: labels[0])

# Per-class precision / recall / F1 of predictions against the true categories.
print(classification_report(preds['category'], preds['result']))

                      precision    recall  f1-score   support

             Economy       0.88      0.79      0.83       283
Science & Technology       0.82      0.90      0.86       293
              Sports       0.98      0.97      0.98       311

            accuracy                           0.89       887
           macro avg       0.89      0.89      0.89       887
        weighted avg       0.89      0.89      0.89       887



In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics

**Model 3: averaged GloVe word embeddings + ClassifierDL (accuracy ≈ 88%)**

In [None]:
# Pipeline: document -> tokens -> normalized tokens -> stop-word removal -> lemmas
# -> word embeddings -> averaged sentence embedding -> classifier.

# Wrap the "article_content" column into the "document" annotation column.
document_assembler = (
    DocumentAssembler()
    .setInputCol("article_content")
    .setOutputCol("document")
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Input columns are given as a list, like every other stage in this pipeline
# (the original passed a bare string here).
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
    .setCaseSensitive(False)
)

lemma = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# `pretrained` is called on the class directly; the original built a throwaway
# `WordEmbeddingsModel()` instance first. With no arguments it fetches the default
# model (the recorded cell output shows glove_100d).
glove_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the per-token embeddings into one vector per document by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier: labels come from "category"; 10 epochs, learning rate 0.001,
# batch size 8, with training output logs enabled.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("category")
    .setMaxEpochs(10)
    .setLr(0.001)
    .setBatchSize(8)
    .setEnableOutputLogs(True)
    # .setOutputLogsPath('logs')
)

clf_pipeline = Pipeline(
    stages=[
        document_assembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        lemma,
        glove_embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Train the GloVe-based classification pipeline on the training split.
clf_pipelineModel = clf_pipeline.fit(trainingData)

In [None]:
# Apply the fitted GloVe pipeline to the held-out test split.
preds4 = clf_pipelineModel.transform(testData)
# Show the first 10 rows: true category, article text (cut at 80 characters), predicted label array.
preds4.select('category','article_content',"class.result").show(10, truncate=80)

+--------+--------------------------------------------------------------------------------+----------------------+
|category|                                                                 article_content|                result|
+--------+--------------------------------------------------------------------------------+----------------------+
| Economy|$52bn bipartisan deal would subsidise US semiconductor manufacturing to boost...|             [Economy]|
| Economy|A ban is in the works but the question is, how effective will it be in limiti...|             [Economy]|
| Economy|A donor conference aims to help Moldova deal with the impact of the Ukraine w...|             [Economy]|
| Economy|A new joint task force to work on slashing the bloc’s dependence on Russian f...|             [Economy]|
| Economy|A new report says most exports went to European countries as Kyiv urges the W...|             [Economy]|
| Economy|A number of countries have pledged $8.5bn and technical assistance to 

In [None]:
from sklearn.metrics import classification_report

# Bring the true label, the article text and the predicted-label array into pandas.
preds = preds4.select('category', 'article_content', "class.result").toPandas()

# Each prediction arrives as a sequence; keep its first element as the predicted label.
preds['result'] = preds['result'].apply(lambda labels: labels[0])

# Per-class precision / recall / F1 of predictions against the true categories.
print(classification_report(preds['category'], preds['result']))

                      precision    recall  f1-score   support

             Economy       0.84      0.82      0.83       283
Science & Technology       0.84      0.85      0.85       293
              Sports       0.96      0.97      0.97       311

            accuracy                           0.88       887
           macro avg       0.88      0.88      0.88       887
        weighted avg       0.88      0.88      0.88       887



In [None]:
!pip install flask_ngrok

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flask_ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask-ngrok
Successfully installed flask-ngrok-0.0.25


**Déploiement avec Flask**

In [None]:
from flask_ngrok import run_with_ngrok
import numpy as np
from flask import Flask, request, render_template
from keras.models import load_model

from pyspark.ml import Pipeline

# Create flask app
flask_app = Flask(__name__)
#run_with_ngrok(flask_app)

# Load the saved classifier stage and use THAT object as the final pipeline stage.
# The original assigned the loaded model to the misspelled name `lassifierDLModel` and then
# put `ClassifierDLModel` into the pipeline, so the freshly loaded model was never used.
loaded_classifier = ClassifierDLModel.load('MultilabelClfBert')

# Generate prediction Pipeline with loaded Model
ld_pipeline = Pipeline(stages=[document, bert_cmlm, loaded_classifier])
# `fit` is called on a one-row placeholder DataFrame (a single empty string in column "text").
ld_pipeline_model = ld_pipeline.fit(spark.createDataFrame([['']]).toDF("text"))


@flask_app.route("/")
def Home():
    """Serve the landing page rendered from the "index.html" template."""
    landing_page = render_template("index.html")
    return landing_page

@flask_app.route("/predict", methods = ["POST"])
def predict():
    """Classify the article text submitted through the form and render the result.

    The original converted the form values to floats and passed a list of numpy
    arrays to ``ld_pipeline_model.transform``. The pipeline is a text classifier:
    ``transform`` needs a Spark DataFrame, and the notebook's DocumentAssembler
    stages read the "article_content" column. The submitted text is therefore
    wrapped in a one-row DataFrame, and the predicted label (not the raw
    transform result) is shown.

    Returns:
        The rendered "index.html" template with ``prediction_text`` set.
    """
    # The form's field names are defined in index.html, which is not visible here,
    # so all submitted values are joined into a single text.
    article_text = " ".join(request.form.values()).strip()
    if not article_text:
        return render_template("index.html", prediction_text = "Please provide the article text to classify.")

    input_df = spark.createDataFrame([[article_text]]).toDF("article_content")
    rows = ld_pipeline_model.transform(input_df).select("class.result").collect()

    # "class.result" is an array of labels per row; the first entry is the predicted category.
    labels = rows[0]["result"] if rows else []
    prediction = labels[0] if labels else "unknown"
    return render_template("index.html", prediction_text = "The article categorie is {}".format(prediction))

# Start the Flask server with its default settings, but only when this code runs as the main module.
if __name__ == "__main__":
    flask_app.run()
