## Libraries and basic NLP


In [34]:
### Install necessary libraries
!pip install nltk spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [35]:
### Import necessary libraries
import nltk
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
from nltk.chunk import ne_chunk
from nltk.tree import Tree

In [36]:
# Download necessary datasets
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/gthakkar/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gthakkar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gthakkar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/gthakkar/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/gthakkar/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /home/gthakkar/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [37]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [38]:

### Sample text
txt = "Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence. It helps computers understand human language. Google, Amazon, and Apple use NLP in their products."


In [39]:

### Tokenization
print("\nTokenization:")
print("Word Tokenization:", word_tokenize(txt))
print("Sentence Tokenization:", sent_tokenize(txt))



Tokenization:
Word Tokenization: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', 'of', 'Artificial', 'Intelligence', '.', 'It', 'helps', 'computers', 'understand', 'human', 'language', '.', 'Google', ',', 'Amazon', ',', 'and', 'Apple', 'use', 'NLP', 'in', 'their', 'products', '.']
Sentence Tokenization: ['Natural Language Processing (NLP) is a fascinating field of Artificial Intelligence.', 'It helps computers understand human language.', 'Google, Amazon, and Apple use NLP in their products.']


In [40]:
### Stopwords Removal
stop_words = set(stopwords.words('english'))
tokens = word_tokenize(txt)
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
print("\nAfter Stopwords Removal:", filtered_tokens)



After Stopwords Removal: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'Artificial', 'Intelligence', '.', 'helps', 'computers', 'understand', 'human', 'language', '.', 'Google', ',', 'Amazon', ',', 'Apple', 'use', 'NLP', 'products', '.']


In [41]:
### Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in filtered_tokens]
print("\nAfter Stemming:", stemmed_words)



After Stemming: ['natur', 'languag', 'process', '(', 'nlp', ')', 'fascin', 'field', 'artifici', 'intellig', '.', 'help', 'comput', 'understand', 'human', 'languag', '.', 'googl', ',', 'amazon', ',', 'appl', 'use', 'nlp', 'product', '.']


In [42]:
### Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_tokens]
print("\nAfter Lemmatization:", lemmatized_words)



After Lemmatization: ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'fascinating', 'field', 'Artificial', 'Intelligence', '.', 'help', 'computer', 'understand', 'human', 'language', '.', 'Google', ',', 'Amazon', ',', 'Apple', 'use', 'NLP', 'product', '.']


In [43]:
### Part-of-Speech (POS) Tagging
pos_tags = pos_tag(word_tokenize(txt))
print("\nPOS Tagging:", pos_tags)



POS Tagging: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('(', '('), ('NLP', 'NNP'), (')', ')'), ('is', 'VBZ'), ('a', 'DT'), ('fascinating', 'JJ'), ('field', 'NN'), ('of', 'IN'), ('Artificial', 'JJ'), ('Intelligence', 'NNP'), ('.', '.'), ('It', 'PRP'), ('helps', 'VBZ'), ('computers', 'NNS'), ('understand', 'VBP'), ('human', 'JJ'), ('language', 'NN'), ('.', '.'), ('Google', 'NNP'), (',', ','), ('Amazon', 'NNP'), (',', ','), ('and', 'CC'), ('Apple', 'NNP'), ('use', 'VBP'), ('NLP', 'NNP'), ('in', 'IN'), ('their', 'PRP$'), ('products', 'NNS'), ('.', '.')]


In [44]:
### Named Entity Recognition (NER) using NLTK
chunked = ne_chunk(pos_tags)
print("\nNamed Entities (NLTK):")
for subtree in chunked:
    if isinstance(subtree, Tree):
        print(" ".join(word for word, tag in subtree.leaves()), "->", subtree.label())



Named Entities (NLTK):
NLP -> ORGANIZATION
Artificial Intelligence -> ORGANIZATION
Google -> PERSON
Amazon -> GPE
NLP -> ORGANIZATION


In [45]:
### Named Entity Recognition (NER) using spaCy
doc = nlp(txt)
print("\nNamed Entities (spaCy):")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)


Named Entities (spaCy):
Natural Language Processing -> ORG
NLP -> ORG
Artificial Intelligence -> ORG
Amazon -> ORG
Apple -> ORG
NLP -> ORG


In [46]:
import string

In [47]:

"""
### Cleaning Text

Text preprocessing involves cleaning and normalizing the text data.
"""

def clean_text(text):
    # Remove special characters and URLs
    text = ''.join([c for c in text if c not in string.punctuation])
    text = text.replace('[', '').replace(']', '')
    text = text.lower()
    # Remove numbers
    text = ''.join([c for c in text if not c.isdigit()])
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

sample_text = "Hello!   This is a sample text. It has  numbers like 123 and special characters like @#$."
print("Original Text:")
print(sample_text)
print("\nCleaned Text:")
print(clean_text(sample_text))

Original Text:
Hello!   This is a sample text. It has  numbers like 123 and special characters like @#$.

Cleaned Text:
hello this is a sample text it has numbers like and special characters like


## Sentiment analysis

In [48]:
!pip install textblob




In [49]:
from textblob import TextBlob
"""
### Sentiment Analysis

TextBlob can be used for sentiment analysis. It returns polarity and subjectivity.
"""

blob = TextBlob("This was a fantastic experience!")
print("Polarity (Sentiment):", blob.sentiment.polarity)  # 1.0 is positive
print("Subjectivity:", blob.sentiment.subjectivity)     # 1.0 is very subjective

text2 = "I am disappointed with the service."
blob2 = TextBlob(text2)
print("\nPolarity:", blob2.sentiment.polarity)
print("Subjectivity:", blob2.sentiment.subjectivity)



Polarity (Sentiment): 0.5
Subjectivity: 0.9

Polarity: -0.75
Subjectivity: 0.75


## File manipulations

In [50]:
# Create a sample text file
with open("sample.txt", "w") as file:
    file.write("Natural Language Processing (NLP) is a fascinating field of AI.\n")
    file.write("It enables computers to understand, interpret, and generate human language.\n")
    file.write("This text file serves as an example for NLP processing.\n")

print("input.txt has been created successfully!")


input.txt has been created successfully!


In [51]:
"""
### Reading Lines from a Text File

We can read a text file line by line using the `readlines()` method.
"""

with open('sample.txt', 'r') as file:
    lines = file.readlines()

print("Lines in the file:")
for line in lines:
    print(line.strip())

Lines in the file:
Natural Language Processing (NLP) is a fascinating field of AI.
It enables computers to understand, interpret, and generate human language.
This text file serves as an example for NLP processing.


In [52]:
"""
### Introduction to CSV Files

CSV (Comma-Separated Values) files are widely used for storing tabular data. We'll use the `pandas` library for CSV manipulation.

First, let's install `pandas` if you haven't already.
"""


!pip install pandas



In [53]:
## Reading a CSV File
"""
### Reading a CSV File

Let's read a sample CSV file using `pandas`.
"""

import pandas as pd


## Writing to a CSV File
"""
### Writing to a CSV File

We can create a new CSV file from a DataFrame.
"""

# Create a DataFrame
data = {
    'Name': ['David', 'Eve', 'Frank'],
    'Age': [28, 32, 40],
    'City': ['Miami', 'Seattle', 'Boston']
}

new_df = pd.DataFrame(data)
print("\nNew DataFrame:")
print(new_df)

# Write the DataFrame to a CSV file
new_df.to_csv('output.csv', index=False)
print("\nNew CSV file created successfully.")






New DataFrame:
    Name  Age     City
0  David   28    Miami
1    Eve   32  Seattle
2  Frank   40   Boston

New CSV file created successfully.


In [54]:
# Cell 4: Advanced CSV Operations

## Adding Columns to a CSV File
"""
### Adding Columns to a CSV File

We can add new columns to an existing DataFrame and save the updated data to a CSV file.
"""

# Read the CSV file
df = pd.read_csv('output.csv')
print("Original DataFrame:")
print(df)

# Add a new column
df['Employed'] = ['Yes', 'No', 'Yes']
print("\nDataFrame after adding a column:")
print(df)

# Write the updated DataFrame to a new CSV file
df.to_csv('updated_output.csv', index=False)
print("\nUpdated CSV file created successfully.")

## Filtering Rows
"""
### Filtering Rows

We can filter rows in a DataFrame based on conditions.
"""

# Filter rows where Age > 30
filtered_df = df[df['Age'] > 30]
print("Filtered DataFrame:")
print(filtered_df)

## Calculating Totals
"""
### Calculating Totals

We can perform calculations on the data, such as calculating the total age.
"""

total_age = df['Age'].sum()
print("Total Age:", total_age)

Original DataFrame:
    Name  Age     City
0  David   28    Miami
1    Eve   32  Seattle
2  Frank   40   Boston

DataFrame after adding a column:
    Name  Age     City Employed
0  David   28    Miami      Yes
1    Eve   32  Seattle       No
2  Frank   40   Boston      Yes

Updated CSV file created successfully.
Filtered DataFrame:
    Name  Age     City Employed
1    Eve   32  Seattle       No
2  Frank   40   Boston      Yes
Total Age: 100


## CSV manipulation

"""
### Welcome! 

In this notebook, we'll learn how to:
1. Read and manipulate text data from CSV files
2. Preprocess text data for NLP tasks
3. Handle labels (e.g., sentiment, categories)
4. Extract insights from text data

This is essential for tasks like:
- Sentiment analysis
- Text classification
- Named Entity Recognition (NER)
- Topic modeling

Let's get started!



In [55]:
# Cell 2: Installing Required Libraries

### Install Libraries


!pip install pandas nltk scikit-learn
!python -m nltk downloadpunkt
!python -m nltk download stopwords


/home/gthakkar/miniconda3/bin/python: No module named nltk.__main__; 'nltk' is a package and cannot be directly executed
/home/gthakkar/miniconda3/bin/python: No module named nltk.__main__; 'nltk' is a package and cannot be directly executed


In [56]:
# Cell 3: Importing Libraries
"""
### Import Libraries
"""

import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
import string

In [57]:


# Cell 4: Create Sample CSV Data
"""
### Create Sample CSV Data

We'll create a sample CSV file for our exercises.
"""

# Sample data
data = {
    'text': [
        "This is a sample sentence for NLP.",
        "We are learning text preprocessing in Python.",
        "Natural Language Processing is fun and challenging.",
        "I love pizza! It's my favorite food.",
        "Machine learning is exciting, but sometimes complicated.",
        "Weather is nice today, perfect for a walk.",
        "Python is great for NLP tasks.",
        "Text data can be noisy and unstructured.",
        "We need to clean and preprocess the text data.",
        "NLP is the future of AI and human-computer interaction."
    ],
    'label': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
}

# Create DataFrame
df = pd.DataFrame(data)
# Save to CSV
df.to_csv('sample_nlp_data.csv', index=False)

print("Sample CSV file created successfully.")








Sample CSV file created successfully.


In [58]:
# Cell 5: Reading a CSV File
"""
### Read a CSV File

We'll read the sample CSV file using `pandas`.
"""

df = pd.read_csv('sample_nlp_data.csv')
print("Data loaded from CSV file:")
print(df)



Data loaded from CSV file:
                                                text  label
0                 This is a sample sentence for NLP.      0
1      We are learning text preprocessing in Python.      1
2  Natural Language Processing is fun and challen...      0
3               I love pizza! It's my favorite food.      1
4  Machine learning is exciting, but sometimes co...      0
5         Weather is nice today, perfect for a walk.      1
6                     Python is great for NLP tasks.      0
7           Text data can be noisy and unstructured.      1
8     We need to clean and preprocess the text data.      0
9  NLP is the future of AI and human-computer int...      1


In [59]:
# Cell 6: Handling Missing Text Data
"""
### Handling Missing Text Data

Real-world text data often has missing values. Let's see how to handle them.
"""

# Introduce missing values
df.loc[[2, 5], 'text'] = np.nan


print("DataFrame with missing values:")
print(df)

# Handle missing values by replacing with an empty string
df['text'].fillna('', inplace=True)

print("\nDataFrame after filling missing values:")
print(df)


DataFrame with missing values:
                                                text  label
0                 This is a sample sentence for NLP.      0
1      We are learning text preprocessing in Python.      1
2                                                NaN      0
3               I love pizza! It's my favorite food.      1
4  Machine learning is exciting, but sometimes co...      0
5                                                NaN      1
6                     Python is great for NLP tasks.      0
7           Text data can be noisy and unstructured.      1
8     We need to clean and preprocess the text data.      0
9  NLP is the future of AI and human-computer int...      1

DataFrame after filling missing values:
                                                text  label
0                 This is a sample sentence for NLP.      0
1      We are learning text preprocessing in Python.      1
2                                                         0
3               I love pizza

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['text'].fillna('', inplace=True)


In [60]:
# Cell 7: Extracting Text and Labels
"""
### Extracting Text and Labels

In NLP, we often work with text and its corresponding labels (e.g., sentiment, category).
"""

# Extract text column
texts = df['text'].tolist()
print("Sample texts:")
print(texts[:3])

# Extract labels
labels = df['label'].tolist()
print("\nSample labels:")
print(labels[:3])




Sample texts:
['This is a sample sentence for NLP.', 'We are learning text preprocessing in Python.', '']

Sample labels:
[0, 1, 0]


In [61]:
# Cell 8: Preprocessing Text Data
"""
### Preprocessing Text Data

Text preprocessing is an essential step in NLP. We'll perform the following steps:
1. Lowercasing
2. Removing special characters and punctuation
3. Removing stop words
4. Stemming or lemmatization
"""

# Example text
sample_text = "This is a sample sentence for NLP. Natural Language Processing is fun!"

# Step 1: Lowercasing
lower_text = sample_text.lower()
print("Lowercase text:", lower_text)

# Step 2: Removing punctuation
clean_text = re.sub(r'[^\w\s]', '', lower_text)
print("\nText without punctuation:", clean_text)

# Step 3: Tokenization
tokens = word_tokenize(clean_text)
print("\nTokens:", tokens)

# Step 4: Removing stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token not in stop_words]
print("\nFiltered tokens:", filtered_tokens)

# Step 5: Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print("\nStemmed tokens:", stemmed_tokens)

# Step 6: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
print("\nLemmatized tokens:", lemmatized_tokens)


Lowercase text: this is a sample sentence for nlp. natural language processing is fun!

Text without punctuation: this is a sample sentence for nlp natural language processing is fun

Tokens: ['this', 'is', 'a', 'sample', 'sentence', 'for', 'nlp', 'natural', 'language', 'processing', 'is', 'fun']

Filtered tokens: ['sample', 'sentence', 'nlp', 'natural', 'language', 'processing', 'fun']

Stemmed tokens: ['sampl', 'sentenc', 'nlp', 'natur', 'languag', 'process', 'fun']

Lemmatized tokens: ['sample', 'sentence', 'nlp', 'natural', 'language', 'processing', 'fun']


In [62]:
# Cell 9: Preprocessing All Rows in the DataFrame
"""
### Preprocessing All Rows in the DataFrame

We'll apply the preprocessing steps to the entire text column.
"""

def preprocess_text(text):
    if text == '':
        return ''
    # Lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'_', '', text)
    text = re.sub(r'\s+', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    # Join tokens back into a string
    return ' '.join(lemmatized_tokens)

# Apply preprocessing to the entire text column
df['preprocessed_text'] = df['text'].apply(preprocess_text)

print("Preprocessed text data:")
print(df[['text', 'preprocessed_text']].head())

Preprocessed text data:
                                                text  \
0                 This is a sample sentence for NLP.   
1      We are learning text preprocessing in Python.   
2                                                      
3               I love pizza! It's my favorite food.   
4  Machine learning is exciting, but sometimes co...   

                                 preprocessed_text  
0                              sample sentence nlp  
1               learning text preprocessing python  
2                                                   
3                         love pizza favorite food  
4  machine learning exciting sometimes complicated  


In [63]:
# Cell 10: Handling Special Characters and URLs
"""
### Handling Special Characters and URLs

Text data often contains URLs, emojis, or special characters that need to be cleaned.
"""

# Example text with special characters and URLs
text_with_urls = "Check out this link: https://example.com. #cool"

# Remove URLs
text_without_urls = re.sub(r'http\S+', '', text_with_urls)

# Remove special characters
text_clean = re.sub(r'[^\w\s]', '', text_without_urls)
text_clean = re.sub(r'_', '', text_clean)

print("Original text:", text_with_urls)
print("Cleaned text:", text_clean)


Original text: Check out this link: https://example.com. #cool
Cleaned text: Check out this link  cool


In [64]:
# Cell 11: TF-IDF Vectorization for Text Features
"""
### TF-IDF Vectorization

We'll convert text data into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).
"""

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)

# Fit and transform the text data
tfidf_matrix = vectorizer.fit_transform(df['preprocessed_text'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Get feature names
feature_names = vectorizer.get_feature_names_out()
print("\nFeature names (words):")
print(feature_names[:10])


TF-IDF matrix shape: (10, 27)

Feature names (words):
['ai' 'clean' 'complicated' 'data' 'exciting' 'favorite' 'food' 'future'
 'great' 'humancomputer']


In [65]:
# Cell 12: Sentiment Analysis as a Use Case
"""
### Sentiment Analysis

We'll use the preprocessed text data to perform basic sentiment analysis using `TextBlob`.
"""

from textblob import TextBlob

# Create a new TextBlob object for each text
df['sentiment'] = df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

print("Sentiment scores:")
print(df[['text', 'sentiment']].head())



Sentiment scores:
                                                text  sentiment
0                 This is a sample sentence for NLP.     0.0000
1      We are learning text preprocessing in Python.     0.0000
2                                                        0.0000
3               I love pizza! It's my favorite food.     0.5625
4  Machine learning is exciting, but sometimes co...    -0.1000


In [66]:
# Cell 13: Text Classification (Labels)
"""
### Text Classification

We'll demonstrate how to split the data into train and test sets for classification tasks.
"""

from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    df['label'],
    test_size=0.2,
    random_state=42
)

print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Train set shape: (8, 27)
Test set shape: (2, 27)


# Cell 14: Conclusion and Resources
"""
### Conclusion

In this notebook, you learned:
1. How to read and manipulate text data from CSV files
2. How to preprocess text data for NLP tasks
3. How to handle labels and perform sentiment analysis
4. How to convert text data into numerical features (TF-IDF)
5. How to prepare data for text classification

### Resources for Further Learning
1. NLTK Documentation: https://www.nltk.org/
2. scikit-learn TfidfVectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
3. TextBlob Documentation: https://textblob.readthedocs.io/en/dev/
4. Sentiment Analysis Tutorial: https://realpython.com/python-textblob/

Happy coding!
"""