# LinkedIn Posts

In [1]:
import pandas as pd
import spacy

In [2]:
df = pd.read_json('posts.json')

In [3]:
df

Unnamed: 0,descripcion,fecha
0,Here we have the lead trends for 2022!!!▪️Arti...,17/12/2021
1,It’s almost 2022 and we want to share the AI l...,15/12/2021
2,"""...Here we [...] demonstrate a method by whic...",10/12/2021
3,The limit of RPA is the imagination of the pro...,9/12/2021
4,Introducing an indoor garden controlled with a...,2/12/2021
...,...,...
266,Have you seen the future of the creative proce...,27/3/2023
267,It's fascinating how our understanding of brai...,29/3/2023
268,This is a summary of our last article: Artific...,30/3/2023
269,Nature has inspired a lot of innovations from ...,31/3/2023


In [4]:
df['descripcion'][56]

'▪️Data are the new gold!! If you have organized quality data, you started the first step to digital transformation. ▪️ #data + #AI = Best business decision-making to boost your enterprise.'

In [5]:
corpus=df["descripcion"]

In [6]:
# !python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

In [7]:
def clean_hashtag_url(post):
    """
    remove all hastags and website links from a string
    """    

    return " ".join(word for word in post.split(' ') if ("#" not in word and "http" not in word))

In [8]:
corpus = corpus.apply(clean_hashtag_url)

In [9]:
import re 

def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """

    return token.is_punct or token.is_space


def rm_pattern(post):
    """
    function returning a string without "...see more" and website links from the post. 
    This function uses "re.sub" to remove a specific pattern"""

    post = re.sub("…see more",'', post) # replace pattern by an empty string
    post = re.sub('http','',post)
    return post



def rules(token):
    """
    conditions to select a specific token for the corpus cleaning
    used with all() function : return True if all True.
    Conditions are : no pure puncuation - no pure whitespace - not a stopword 
    - not a #word
    """

    return [not punct_space(token),
            token not in nlp.Defaults.stop_words] 

In [10]:
def corpus_cleaning(posts):
    """
    generator function using spaCy to parse posts,
    remove "...see more" pattern, website links, lemmatize the text, lowercase words, and apply all the conditions we have set in rules.
    What this function returns is a generator (a "list") of individual tokens contained in lists. 
    """
    
    for post in nlp.pipe(posts.apply(rm_pattern)):
        yield ' '.join([token.lemma_.lower() for token in post if all(rules(token))])

In [11]:
preprocessed_posts = corpus_cleaning(corpus)
preprocessed_posts

<generator object corpus_cleaning at 0x00000205C90B4AC0>

In [12]:
streamed_posts = (post.split(' ') for post in preprocessed_posts)
streamed_posts

<generator object <genexpr> at 0x00000205C90B52A0>

In [13]:
# for post in streamed_posts:
#     print(post)

In [14]:
all_posts = []

for streamed_post in streamed_posts:
    post = ' '.join(streamed_post)
    all_posts.append(post)

In [15]:
all_posts

['here we have the lead trend for 2022!!! ▪ ️artificial intelligence ▪ ️data and analytics ▪ ️rpawhich one will you implement in your business',
 'it ’ almost 2022 and we want to share the ai lead trend for the future year',
 'here we demonstrate a method by which can aid mathematician in discover new and that be simply amazing how thing be evolve in this field',
 'the limit of rpa be the imagination of the programmer 🤖 ▪ ️can you tell we a rpa out of the box application juan carlos castaño valencia diana cristhina pérez pérez fabian esteban peña castillo',
 'introduce an indoor garden control with an app and ai to enjoy fresh vegetable every day it will let you grow 64 plant simultaneously green ai mashable',
 'five way al contribute to wildlifeconservation 1 vast data collection 📊 2 study and protect specie 🐨 3 track wildlife pattern 📈 4 prediction of endanger specie 🐼 5 classify animal specie',
 'the participant of the plan semilla apprenticeship talk about their experience in the l

In [16]:
df['descripcion_clean'] = all_posts

In [17]:
df.head()

Unnamed: 0,descripcion,fecha,descripcion_clean
0,Here we have the lead trends for 2022!!!▪️Arti...,17/12/2021,here we have the lead trend for 2022!!! ▪ ️art...
1,It’s almost 2022 and we want to share the AI l...,15/12/2021,it ’ almost 2022 and we want to share the ai l...
2,"""...Here we [...] demonstrate a method by whic...",10/12/2021,here we demonstrate a method by which can aid ...
3,The limit of RPA is the imagination of the pro...,9/12/2021,the limit of rpa be the imagination of the pro...
4,Introducing an indoor garden controlled with a...,2/12/2021,introduce an indoor garden control with an app...
