# Corpus Collection & Analysis

## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import brown
from collections import Counter
from nltk.corpus import stopwords

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mujta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Analyzing the original data

The original LinkedIn data is stored in following directory: "../data/linkedIn_data.csv"

In [2]:
original_data = pd.read_csv("../data/linkedIn_data.csv")

original_data.head()

  original_data = pd.read_csv("../data/linkedIn_data.csv")


Unnamed: 0.1,Unnamed: 0,name,headline,location,followers,connections,about,time_spent,content,content_links,media_type,media_url,num_hashtags,hashtag_followers,hashtags,reactions,comments,views,votes
0,0,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman for the past 25 years has shone...,1 day ago,Robert Lerman writes that achieving a healthy...,[['https://www.linkedin.com/in/ACoAAACy1HkBviR...,article,['https://www.urban.org/urban-wire/its-time-mo...,4,0,"[['#workbasedlearning', 'https://www.linkedin....",12,1,,
1,1,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman for the past 25 years has shone...,1 week ago,"National disability advocate Sara Hart Weir, ...",[['https://www.linkedin.com/in/ACoAAAHsfJgBb7_...,,[],0,0,[],11,0,,
2,2,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman for the past 25 years has shone...,2 months ago,,[],,[],0,0,[],15,0,,
3,3,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman for the past 25 years has shone...,2 months ago,Exploring in this months Talent Management & H...,[['https://www.linkedin.com/in/ACoAAAADlGIBLfn...,article,['https://www.tlnt.com/apprenticeships-that-br...,4,0,"[['#careerplanning', 'https://www.linkedin.com...",44,0,,
4,4,Nicholas Wyman,CEO IWSI Group,,6484.0,500+,Nicholas Wyman for the past 25 years has shone...,2 months ago,I count myself fortunate to have spent time wi...,[['https://www.linkedin.com/in/ACoAABhNxDUB9IX...,article,['https://gritdaily.com/the-legacy-of-verified...,3,0,"[['#verifiedresumes', 'https://www.linkedin.co...",22,2,,


In [4]:
original_data['media_type'].value_counts()

media_type
article       15144
image          8708
video          2690
document        113
poll             86
entity           32
newsletter        4
view              2
Name: count, dtype: int64

The data has a basic annotation with media type column, which gives categories upon different kind of posts by influencers. Like 'entity' posts are hiring posts as seen below. But 'article', 'video' and 'image' posts can be further labelled into many categories which will be useful in analysis for LinkedIn users. That is why we will sample from 'article' 'video' and 'image' media_type only as they are large in number while others are too specific or have a very low number of instances.

### Entity (Hiring Posts)

In [5]:
original_data.loc[original_data['media_type']=='entity']['content'].head(5)

1011    GUESS WHO'S HIRING!?  At 500, we're on a missi...
1012    WE'RE HIRING! Come join me in bringing  500 St...
1396                                       Join our team!
1588    Wonolo is growing rapidly and we're hiring an ...
1589    Wonolo is growing rapidly and we’re hiring an ...
Name: content, dtype: object

### Article

In [6]:
original_data.loc[original_data['media_type']=='article']['content'].head(5)

0    Robert Lerman  writes that achieving a healthy...
3    Exploring in this months Talent Management & H...
4    I count myself fortunate to have spent time wi...
5    Online job platforms are a different way of wo...
6    Between the burgeoning unemployment rates and ...
Name: content, dtype: object

### Video

In [7]:
original_data.loc[original_data['media_type']=='video']['content'].head(5)

35     The sign above the steel gates of Auschwitz re...
37     Great to talk with Fox Business today on why c...
170    Watch this live podcast on trauma tonight:  ht...
192    A fun podcast with an excellent educational le...
249    For those of you who couldn't attend the live ...
Name: content, dtype: object

### Image

In [8]:
original_data.loc[original_data['media_type']=='image']['content'].head(5)

23    No-one can be sure how America will ‘snap back...
48    Our new report, Using Modern Apprenticeship to...
62    Kentucky announces an innovative new registere...
64    Forging partnerships with industry is critical...
75    Students in Darwin in Australia’s Top End will...
Name: content, dtype: object

### Missing values in Content (posts) Column

In [43]:
original_data["content"].isna().sum()

2016

## Preparing data using sampling

### Extracting 'article' 'video' and 'image' media_type posts and removing empty posts for sampling

In [17]:
cleaned_data = original_data[original_data['media_type'].isin(['article','video', 'image' ])]

cleaned_data = cleaned_data.dropna(subset=["content"])

cleaned_data['media_type'].value_counts()

media_type
article    14590
image       8614
video       2674
Name: count, dtype: int64

### Checking duplicate posts in original dataset for  'article' 'video' and 'image' media_type posts

In [42]:
duplicate_values = cleaned_data['content'][cleaned_data['content'].duplicated()]
print(len(duplicate_values)) 

0


### Removing duplicates

In [23]:
cleaned_data = cleaned_data.drop_duplicates(subset=["content"], keep="first")

### Randomly Sampling 1600 posts for annotations with fixed random state of 523 for reproducibility of code

In [31]:
sampled_data = cleaned_data.sample(n=1600, random_state=523) 

sampled_data['media_type'].value_counts()

media_type
article    900
image      544
video      156
Name: count, dtype: int64

### Checking duplicate posts in sampled dataset

In [32]:
duplicate_values = cleaned_data['content'][cleaned_data['content'].duplicated()]
print(len(duplicate_values)) 

0


### Removing irrelavnt columns and user info columns

In [33]:
columns_to_remove = ["name", "headline", "location", "about", "votes", "media_url"]

sampled_data = sampled_data.drop(columns=columns_to_remove, errors="ignore")

### Creating four different data files for annotators

"label" column:  The label assigned by annotator using schema

In [35]:
sampled_data['label'] = ''

# Annotation files for Muhammad, Kartik, Timothy, Zhengyi

members = ['Muhammad', 'Kartik', 'Timothy', 'Zhengyi']

split_data = np.array_split(sampled_data, 4)

# Defining the subsets of 4 data parts
subsets = [
    [0, 1, 2],  # Muhammad: data parts 1, 2, 3
    [0, 1, 3],  # Kartik: data parts 1, 2, 4
    [0, 2, 3],  # Timothy: data parts 1, 3, 4
    [1, 2, 3]   # Zhengyi: data parts 2, 3, 4
]

# Making 1200 instances in a file for each member covering a total of 1600 instances, each labelled three times
for i, subset in enumerate(subsets):
    combined_df = pd.concat([split_data[j] for j in subset], ignore_index=True)
    combined_df.rename(columns={"Unnamed: 0": "index"}, inplace=True)
    
    combined_df.to_csv(f"../data/Annotation_instances/linkedIn_data_{members[i]}.csv", index=False)

## Analyzing Posts Text

### Combining all posts into one text from our sampled data of 1600 instances

In [36]:
posts = " ".join(sampled_data['content'].astype(str))

### Number of tokens

In [37]:
tokens = word_tokenize(posts)

print(f"Total number of tokens: {len(tokens)}")

Total number of tokens: 95763


### Performing statistical analysis of influencer's posts with respect to brown corpus

In [38]:
brown_text = " ".join(brown.words()).lower()

In [39]:
def compute_statistics(text):
    words = nltk.word_tokenize(text)
    words = [w for w in words if w.isalpha()]
    
    num_words = len(words)
    num_sentences = text.count(".") 
    
    avg_word_length = np.mean([len(w) for w in words])
    vocab_size = len(set(words))
    lexical_diversity = vocab_size / num_words if num_words > 0 else 0
    hapax_legomena = sum(1 for w in Counter(words).values() if w == 1) 
    avg_sentence_length = num_words / num_sentences if num_sentences > 0 else 0
    type_token_ratio = vocab_size / num_words if num_words > 0 else 0
    
    return {
        "num_words": num_words,
        "num_sentences": num_sentences,
        "avg_word_length": avg_word_length,
        "vocab_size": vocab_size,
        "lexical_diversity": lexical_diversity,
        "hapax_legomena": hapax_legomena,
        "avg_sentence_length": avg_sentence_length,
        "type_token_ratio": type_token_ratio
    }

In [40]:
import pandas as pd

# Compute statistics for Brown Corpus and Posts
brown_stats = compute_statistics(brown_text)
posts_stats = compute_statistics(posts)

stats_data = {
    "Statistic": ["Number of Words", "Number of Sentences", "Average Word Length", "Vocabulary Size",
                  "Lexical Diversity", "Hapax Legomena", "Average Sentence Length", "Type-to-Token Ratio"],
    "Brown Corpus": [brown_stats["num_words"], brown_stats["num_sentences"], brown_stats["avg_word_length"],
                     brown_stats["vocab_size"], brown_stats["lexical_diversity"], brown_stats["hapax_legomena"],
                     brown_stats["avg_sentence_length"], brown_stats["type_token_ratio"]],
    "Posts": [posts_stats["num_words"], posts_stats["num_sentences"], posts_stats["avg_word_length"],
              posts_stats["vocab_size"], posts_stats["lexical_diversity"], posts_stats["hapax_legomena"],
              posts_stats["avg_sentence_length"], posts_stats["type_token_ratio"]]
}

df_stats = pd.DataFrame(stats_data)

print(df_stats)

                 Statistic   Brown Corpus         Posts
0          Number of Words  992910.000000  76184.000000
1      Number of Sentences   55578.000000   4938.000000
2      Average Word Length       4.678113      4.885580
3          Vocabulary Size   40669.000000  11846.000000
4        Lexical Diversity       0.040959      0.155492
5           Hapax Legomena   15538.000000   6558.000000
6  Average Sentence Length      17.865162     15.428109
7      Type-to-Token Ratio       0.040959      0.155492
