In [1]:
import pandas as pd
import os

# Folder that holds one exported workbook per subreddit. The original absolute
# path stays the default; set TEXT_ANALYTICS_DATA_DIR to run the notebook on
# another machine without editing this cell.
data_folder = os.environ.get(
    "TEXT_ANALYTICS_DATA_DIR",
    "E:/tcd/Text analytics/Paper/text_analytics_data",
)

# Workbooks to analyse. The extension-less file name is used as the subreddit
# label in every later summary, so names are kept exactly as they were loaded
# (including the "textAnalaytics" spelling of the last entry).
file_names = [
    "books_textAnalytics.xlsx", "SimpleLiving.xlsx", "Liberal.xlsx", "Parenting.xlsx",
    "mentalhealth_textAnalytics.xlsx", "Atheism.xlsx", "Feminism.xlsx", "OutOfTheLoop_textAnalytics.xlsx",
    "relationships_textAnalytics.xlsx", "PoliticalDebate_textAnalytics.xlsx",
    "NeutralPolitics.xlsx", "changemyview_textAnalaytics.xlsx"
]

# Report every missing workbook at once instead of failing on the first one
# halfway through the load.
missing_files = [
    name for name in file_names
    if not os.path.isfile(os.path.join(data_folder, name))
]
if missing_files:
    raise FileNotFoundError(
        f"Missing workbook(s) in {data_folder!r}: {missing_files}"
    )

# Load every workbook into a dict keyed by the file name without its extension.
# splitext only strips the trailing extension, whereas str.replace would also
# remove ".xlsx" if it appeared in the middle of a name.
dataframes = {
    os.path.splitext(file)[0]: pd.read_excel(os.path.join(data_folder, file))
    for file in file_names
}

# Quick sanity check: (rows, columns) per subreddit, shown as the cell output.
summary_info = {name: df.shape for name, df in dataframes.items()}
summary_info


{'books_textAnalytics': (537, 5),
 'SimpleLiving': (957, 5),
 'Liberal': (999, 5),
 'Parenting': (972, 5),
 'mentalhealth_textAnalytics': (989, 5),
 'Atheism': (963, 5),
 'Feminism': (990, 5),
 'OutOfTheLoop_textAnalytics': (986, 5),
 'relationships_textAnalytics': (894, 5),
 'PoliticalDebate_textAnalytics': (996, 5),
 'NeutralPolitics': (982, 5),
 'changemyview_textAnalaytics': (964, 5)}

Explanation: 
🔍 What is Tentative Language?
Tentative language includes words or phrases that express uncertainty, possibility, or hesitation. Instead of stating something with full confidence, people using tentative language show that they are not entirely sure or are being careful about making bold claims.

🧠 Examples of tentative words/phrases:

maybe

I think

possibly

seems

could

I suppose

not sure

These words soften a statement and often appear when someone is being polite, careful, open to discussion, or expressing a personal opinion.

📌 Why Did We Analyze Tentative Language?
In our research on Reddit communities, we're studying how language reflects group identity, inclusion, exclusion, and emotional tone.

Tentative language is important because it can:

Show how open or cautious users are in different communities

Help us compare support-oriented vs. ideological spaces

Reveal how people position themselves in conversations — either confidently or uncertainly

🧪 How We Did It (The Method):
We created a list of common tentative phrases (e.g., maybe, perhaps, I think).

We scanned all Reddit posts and comments in each subreddit dataset.

For every comment, we counted:

How many tentative words it contained.

How many total words it had.

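For each comment we then calculated the number of tentative words per 1,000 words, and averaged these per-comment rates within each subreddit, so we could fairly compare subreddits whose comments differ in length. Comments with no words have no defined rate and are left out of the average.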

📊 What This Tells Us:
By comparing these averages, we can see which communities tend to be more cautious, empathetic, or uncertain in their language — and which ones are more direct, assertive, or confident.

For example, a mental health subreddit might have higher tentative language (e.g., "I think this might help you") — showing care, empathy, and non-judgment.

An ideological or debate subreddit might use less tentative and more assertive language — signaling confidence or group alignment (e.g., "This is clearly wrong", "We must act").

This helps us understand how people communicate differently based on the type of community they’re in.



In [6]:
import pandas as pd
import re
import os
# Tentative (hedging) and assertive (certainty) lexicons. Entries are single
# words or multi-word phrases; both are matched case-insensitively as whole words.
tentative_words = [
    "maybe", "perhaps", "possibly", "might", "could", "I think", "I guess", "not sure",
    "I feel like", "seems", "appears", "probably", "likely", "I suppose"
]
assertive_words = [
    "definitely", "certainly", "clearly", "obviously", "must", "always", "never", "undoubtedly",
    "without a doubt", "no question", "it's clear", "I know", "in fact"
]


def _compile_phrase_pattern(phrases):
    """Build one case-insensitive, whole-word regex that matches any phrase.

    Words inside a phrase may be separated by any run of whitespace, and a
    straight apostrophe in a phrase also matches the typographic apostrophe
    (U+2019).
    """
    alternatives = []
    for phrase in phrases:
        words = []
        for word in phrase.split():
            # "it's" -> it['’]s : accept either apostrophe style in the text.
            words.append("['\u2019]".join(re.escape(part) for part in word.split("'")))
        # Join with \s+ rather than a literal space so that double spaces or a
        # line break inside a phrase ("I  think", "not\nsure") still match.
        alternatives.append(r"\s+".join(words))
    # Non-capturing group: findall then returns the whole match for each hit.
    return re.compile(r"\b(?:" + "|".join(alternatives) + r")\b", re.IGNORECASE)


# Compile once; the patterns are reused for every comment.
tentative_pattern = _compile_phrase_pattern(tentative_words)
assertive_pattern = _compile_phrase_pattern(assertive_words)


def count_tentative_assertive(text):
    """Count tentative and assertive lexicon hits in ``text``.

    Parameters
    ----------
    text : object
        The comment body. Anything that is not a ``str`` (e.g. NaN from an
        empty spreadsheet cell) is treated as containing no hits.

    Returns
    -------
    tuple[int, int]
        ``(tentative_count, assertive_count)``; each is the number of
        non-overlapping whole-word matches of the respective lexicon.
    """
    if not isinstance(text, str):
        return 0, 0
    tentative_count = len(tentative_pattern.findall(text))
    assertive_count = len(assertive_pattern.findall(text))
    return tentative_count, assertive_count

# Summarise tentative vs. assertive language for every subreddit.
# The frames in `dataframes` (built by the loading cell, keyed by file name
# without extension) are reused instead of re-reading each workbook, and they
# are only read from here, never modified.
results = []
for subreddit, posts in dataframes.items():
    # Missing bodies become "", and any non-text cell is coerced to str so the
    # word count below cannot fail on a stray number in the spreadsheet.
    body = posts['body'].fillna("").astype(str)

    # One row of (tentative, assertive) hit counts per comment. Building the
    # frame from a list avoids the tuple-unpacking of zip(*...), which fails
    # when a workbook has no rows.
    hit_counts = pd.DataFrame(
        body.map(count_tentative_assertive).tolist(),
        columns=['tentative', 'assertive'],
        index=body.index,
        dtype=float,
    )

    # Whitespace-delimited word count per comment. Comments with zero words
    # have no defined rate, so their denominator is masked to NaN and they are
    # skipped by mean() rather than relying on 0/0 producing NaN implicitly.
    total_words = body.str.split().str.len()
    word_denominator = total_words.where(total_words > 0)

    # Normalise per 1,000 words, then average the per-comment rates.
    results.append({
        'Subreddit': subreddit,
        'Avg_Tentative_per_1000': (hit_counts['tentative'] / word_denominator * 1000).mean(),
        'Avg_Assertive_per_1000': (hit_counts['assertive'] / word_denominator * 1000).mean()
    })

# One row per subreddit; left as the last expression so the notebook renders
# the full table instead of a wrapped text dump.
summary_df = pd.DataFrame(results)
summary_df


                        Subreddit  Avg_Tentative_per_1000  \
0             books_textAnalytics                5.467418   
1                    SimpleLiving                4.693294   
2                         Liberal                6.414107   
3                       Parenting                4.961209   
4      mentalhealth_textAnalytics                6.577861   
5                         Atheism                5.386393   
6                        Feminism                4.888109   
7      OutOfTheLoop_textAnalytics                7.366483   
8     relationships_textAnalytics                5.152606   
9   PoliticalDebate_textAnalytics                5.770503   
10                NeutralPolitics                3.680489   
11    changemyview_textAnalaytics                6.508076   

    Avg_Assertive_per_1000  
0                 3.487895  
1                 3.369054  
2                 3.167521  
3                 4.504882  
4                 5.309452  
5                 4.143656  
6  

In [7]:
# Write the tentative/assertive summary to a CSV in the current working
# directory. The file name is held once so the message cannot drift from it.
tentative_csv = "linguistic_feature_tentative_language_summary.csv"
summary_df.to_csv(tentative_csv, index=False)
# "\u2705" is the check-mark emoji; the escape keeps the message intact even if
# the notebook file is re-encoded (the literal had been garbled to mojibake).
print(f"\u2705 Summary CSV saved as '{tentative_csv}'")

✅ Summary CSV saved as 'linguistic_feature_tentative_language_summary.csv'


🧠 Cognitive vs. Emotional Language Analysis
This method looks at whether communities use more:

🧠 Cognitive words — like think, know, understand, reason

❤️ Emotional words — like happy, sad, angry, love

It helps show whether a community leans more toward rational discussion or emotional expression — a key difference between ideological, support, and hobby-based communities.


In [8]:

# Seed lexicons for emotional vs. cognitive vocabulary (extend as needed).
# Only the exact forms listed are matched; inflections such as "loved" or
# "thinking" are not.
emotional_words = [
    "happy", "sad", "angry", "love", "hate",
    "joy", "fear", "excited", "depressed", "worried",
    "upset", "proud", "frustrated", "disappointed", "grateful",
]

cognitive_words = [
    "think", "know", "understand", "realize", "believe",
    "because", "reason", "consider", "assume", "logic",
    "idea", "explain", "analyze", "decide", "conclude",
]

# Pre-compiled, case-insensitive, whole-word alternations over each lexicon.
emotional_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(word) for word in emotional_words) + r')\b',
    flags=re.IGNORECASE,
)
cognitive_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(word) for word in cognitive_words) + r')\b',
    flags=re.IGNORECASE,
)


def count_emotional_cognitive(text):
    """Return ``(emotional_count, cognitive_count)`` for ``text``.

    Each count is the number of whole-word, case-insensitive lexicon matches.
    Any value that is not a ``str`` yields ``(0, 0)``.
    """
    if isinstance(text, str):
        n_emotional = sum(1 for _ in emotional_pattern.finditer(text))
        n_cognitive = sum(1 for _ in cognitive_pattern.finditer(text))
        return n_emotional, n_cognitive
    return 0, 0

# Summarise emotional vs. cognitive vocabulary for every subreddit.
# The frames in `dataframes` (built by the loading cell, keyed by file name
# without extension) are reused instead of re-reading each workbook, and they
# are only read from here, never modified.
results = []
for subreddit, posts in dataframes.items():
    # Missing bodies become "", and any non-text cell is coerced to str so the
    # word count below cannot fail on a stray number in the spreadsheet.
    body = posts['body'].fillna("").astype(str)

    # One row of (emotional, cognitive) hit counts per comment. Building the
    # frame from a list avoids the tuple-unpacking of zip(*...), which fails
    # when a workbook has no rows.
    hit_counts = pd.DataFrame(
        body.map(count_emotional_cognitive).tolist(),
        columns=['emotional', 'cognitive'],
        index=body.index,
        dtype=float,
    )

    # Whitespace-delimited word count per comment. Comments with zero words
    # have no defined rate, so their denominator is masked to NaN and they are
    # skipped by mean() rather than relying on 0/0 producing NaN implicitly.
    total_words = body.str.split().str.len()
    word_denominator = total_words.where(total_words > 0)

    # Normalise per 1,000 words, then average the per-comment rates.
    results.append({
        'Subreddit': subreddit,
        'Avg_Emotional_per_1000': (hit_counts['emotional'] / word_denominator * 1000).mean(),
        'Avg_Cognitive_per_1000': (hit_counts['cognitive'] / word_denominator * 1000).mean()
    })

# One row per subreddit; left as the last expression so the notebook renders
# the full table instead of a wrapped text dump.
summary_df = pd.DataFrame(results)
summary_df


                        Subreddit  Avg_Emotional_per_1000  \
0             books_textAnalytics                2.788764   
1                    SimpleLiving                5.166001   
2                         Liberal                1.643860   
3                       Parenting                3.783355   
4      mentalhealth_textAnalytics                5.562809   
5                         Atheism                2.642290   
6                        Feminism                2.386102   
7      OutOfTheLoop_textAnalytics                1.425974   
8     relationships_textAnalytics                4.812065   
9   PoliticalDebate_textAnalytics                0.776728   
10                NeutralPolitics                0.304071   
11    changemyview_textAnalaytics                1.316108   

    Avg_Cognitive_per_1000  
0                 8.208269  
1                 6.804397  
2                 9.560669  
3                 8.075761  
4                12.662897  
5                14.474337  
6  

In [9]:
# Write the emotional/cognitive summary to a CSV in the current working
# directory. The file name is held once so the message cannot drift from it.
cognitive_emotional_csv = "linguistic_feature_cognitive_emotional_language_summary.csv"
summary_df.to_csv(cognitive_emotional_csv, index=False)
# "\u2705" is the check-mark emoji; the escape keeps the message intact even if
# the notebook file is re-encoded (the literal had been garbled to mojibake).
print(f"\u2705 Summary CSV saved as '{cognitive_emotional_csv}'")

✅ Summary CSV saved as 'linguistic_feature_cognitive_emotional_language_summary.csv'


1. Politeness & Rudeness Detection
What it measures:
Counts how often polite or rude words appear in comments/posts, normalized per 1,000 words.

Why it matters:
Shows how respectful, empathetic, or confrontational a community is in its language.

Examples of words used:

Polite: please, thank you, sorry, appreciate

Rude: idiot, dumb, shut up, nonsense

What it tells us:
Communities like r/relationships or r/mentalhealth may use more polite expressions. Ideological or debate-focused subreddits may show more harsh or blunt expressions.

2. Question Usage
What it measures:
Counts how many question marks ? appear per 1,000 words of text.

Why it matters:
Shows how inquisitive or conversational a subreddit is. High question frequency often means people are curious, seeking info, or trying to engage discussion.

What it tells us:
Support and discussion-based subreddits (like r/OutOfTheLoop or r/Parenting) may contain more questions. Debate subreddits might have rhetorical or challenge-style questions.

3. Lexical Diversity (Vocabulary Richness)
What it measures:
The variety of words used — calculated as the number of unique words / total words (called the type-token ratio).

Why it matters:
It shows how varied and rich the language is. High diversity often means thoughtful, original responses. Low diversity can reflect repetitiveness or echo chambers.

What it tells us:
Communities like r/books or r/SimpleLiving may have richer vocabularies. Others may reuse common language or memes more often.

In [4]:

# Politeness and rudeness lexicons. Entries are single words or two-word
# phrases; both are matched case-insensitively as whole words.
polite_words = ["please", "thank you", "thanks", "sorry", "appreciate", "kind", "welcome", "respect", "helpful"]
rude_words = ["stupid", "idiot", "dumb", "shut up", "nonsense", "ignorant", "moron", "hate", "useless"]

# Compile one alternation per lexicon. Each phrase is split on whitespace and
# re-joined with \s+ so that "thank  you" or "shut\nup" (double space, line
# break) still match; a literal single space would miss them. The group is
# non-capturing, so findall returns the whole match for each hit.
polite_pattern = re.compile(
    r'\b(?:' + '|'.join(r'\s+'.join(map(re.escape, phrase.split())) for phrase in polite_words) + r')\b',
    re.IGNORECASE,
)
rude_pattern = re.compile(
    r'\b(?:' + '|'.join(r'\s+'.join(map(re.escape, phrase.split())) for phrase in rude_words) + r')\b',
    re.IGNORECASE,
)


def count_polite_rude(text):
    """Count polite and rude lexicon hits in ``text``.

    Parameters
    ----------
    text : object
        The comment body. Anything that is not a ``str`` (e.g. NaN from an
        empty spreadsheet cell) is treated as containing no hits.

    Returns
    -------
    tuple[int, int]
        ``(polite_count, rude_count)``; each is the number of non-overlapping
        whole-word matches of the respective lexicon.
    """
    if not isinstance(text, str):
        return 0, 0
    polite = len(polite_pattern.findall(text))
    rude = len(rude_pattern.findall(text))
    return polite, rude

# Number of '?' characters in a comment
def count_questions(text):
    """Return how many question-mark characters ``text`` contains.

    Every ``?`` is counted individually (so ``"??"`` counts as 2). Any value
    that is not a ``str`` yields 0.
    """
    return text.count('?') if isinstance(text, str) else 0

# Calculate lexical diversity (type-token ratio)
def lexical_diversity(text):
    """Return the type-token ratio of ``text``: unique words / total words.

    Words are compared case-insensitively and without surrounding punctuation,
    so "Good," "good." and "GOOD!" count as one word type. An apostrophe inside
    a word ("don't") keeps it as a single token.

    Parameters
    ----------
    text : object
        The comment body.

    Returns
    -------
    float or int
        A ratio in (0, 1] for text that contains at least one word; 0 when
        ``text`` is not a ``str`` or contains no words.
    """
    if not isinstance(text, str):
        return 0
    # Tokenise on word characters rather than whitespace: splitting on spaces
    # leaves punctuation attached ("good," vs "good."), which inflates the
    # number of unique types.
    words = re.findall(r"\w+(?:['\u2019]\w+)*", text.lower())
    return len(set(words)) / len(words) if words else 0

# Summarise politeness, rudeness, question usage and lexical diversity for
# every subreddit. The frames in `dataframes` (built by the loading cell, keyed
# by file name without extension) are reused instead of re-reading each
# workbook, and they are only read from here, never modified.
results = []
for subreddit, posts in dataframes.items():
    # Missing bodies become "", and any non-text cell is coerced to str so the
    # word count below cannot fail on a stray number in the spreadsheet.
    body = posts['body'].fillna("").astype(str)

    # One row of (polite, rude) hit counts per comment. Building the frame from
    # a list avoids the tuple-unpacking of zip(*...), which fails when a
    # workbook has no rows.
    tone_counts = pd.DataFrame(
        body.map(count_polite_rude).tolist(),
        columns=['polite', 'rude'],
        index=body.index,
        dtype=float,
    )
    question_count = body.map(count_questions).astype(float)
    diversity = body.map(lexical_diversity).astype(float)

    # Whitespace-delimited word count per comment. Comments with zero words
    # have no defined rate, so their denominator is masked to NaN and they are
    # skipped by mean().
    total_words = body.str.split().str.len()
    has_words = total_words > 0
    word_denominator = total_words.where(has_words)

    results.append({
        'Subreddit': subreddit,
        'Avg_Polite_per_1000': (tone_counts['polite'] / word_denominator * 1000).mean(),
        'Avg_Rude_per_1000': (tone_counts['rude'] / word_denominator * 1000).mean(),
        'Avg_Questions_per_1000': (question_count / word_denominator * 1000).mean(),
        # lexical_diversity returns 0 for a comment with no words; mask those
        # rows so this average covers the same comments as the per-1,000 rates
        # above instead of being pulled down by empty bodies.
        'Avg_Lexical_Diversity': diversity.where(has_words).mean()
    })

# One row per subreddit; left as the last expression so the notebook renders
# the full table instead of a wrapped text dump.
summary_df = pd.DataFrame(results)
summary_df


                        Subreddit  Avg_Polite_per_1000  Avg_Rude_per_1000  \
0             books_textAnalytics             4.917442           0.320266   
1                    SimpleLiving             2.954352           0.438402   
2                         Liberal             3.542858           0.814871   
3                       Parenting             2.458847           0.329102   
4      mentalhealth_textAnalytics             3.008317           1.681955   
5                         Atheism             2.148128           1.390569   
6                        Feminism             3.578229           0.537317   
7      OutOfTheLoop_textAnalytics             2.568467           0.716645   
8     relationships_textAnalytics             1.894586           0.425835   
9   PoliticalDebate_textAnalytics             1.244648           0.446062   
10                NeutralPolitics             1.320073           0.123185   
11    changemyview_textAnalaytics             1.166852           0.609144   

In [5]:
# Write the politeness/questions/diversity summary to a CSV in the current
# working directory. The file name is held once so the message cannot drift
# from it.
feature_summary_csv = "linguistic_feature_summary.csv"
summary_df.to_csv(feature_summary_csv, index=False)
# "\u2705" is the check-mark emoji; the escape keeps the message intact even if
# the notebook file is re-encoded (the literal had been garbled to mojibake).
print(f"\u2705 Summary CSV saved as '{feature_summary_csv}'")



✅ Summary CSV saved as 'linguistic_feature_summary.csv'
