# Assignment 1: Text Preprocessing

# In [None]:
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk

# Download necessary NLTK data files
# 'punkt' backs word_tokenize, 'stopwords' provides the stop-word list and
# 'wordnet' is needed by WordNetLemmatizer.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Load dataset
# Read the full CSV, then keep a reproducible random sample of 10,000 rows
# (random_state pins the sample so reruns see the same reviews).
dataset_path = r"C:\Users\Rishu\Desktop\Parse_effect_soc\IMDB Dataset.csv"
full_dataset = pd.read_csv(dataset_path)
df = full_dataset.sample(n=10000, random_state=42)

# Case Normalisation
lowercased_reviews = df['review'].str.lower()
df['review'] = lowercased_reviews

# Handling HTML Tags & URLs
# Compiled once at import time; the non-greedy match stops at the first '>'.
_HTML_TAG_RE = re.compile(r'<.*?>')


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag replaced by a single space.

    A space (rather than the empty string) is substituted so that words
    separated only by markup, e.g. ``"great.<br /><br />however"``, do not get
    fused into one token once the tags are gone. Surplus whitespace is
    harmless for the later tokenization steps.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; input without tags is returned unchanged.
    """
    return _HTML_TAG_RE.sub(' ', text)

# Strip HTML markup first, then drop anything that looks like a URL
# (a run of non-whitespace characters starting with "http").
reviews_without_html = df['review'].apply(remove_html_tags)
df['review'] = reviews_without_html.apply(lambda text: re.sub(r'http\S+', '', text))

# Removing punctuations
# Build the translation table once instead of once per review.
punctuation_table = str.maketrans('', '', string.punctuation)
df['review'] = df['review'].apply(lambda x: x.translate(punctuation_table))

# Chatwords Handling
# Expand chat abbreviations BEFORE stop-word removal: the expansions
# ('you', 'your', 'are') are English stop words, so expanding afterwards
# would put stop words back into the already-filtered text.
chat_words = {'u': 'you', 'ur': 'your', 'r': 'are'}
df['review'] = df['review'].apply(lambda x: ' '.join([chat_words.get(word, word) for word in word_tokenize(x)]))

# Stop Words Removal
# A set gives O(1) membership tests for the per-token filter below.
stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

# Emoji handling
def remove_emojis(text):
    """Return ``text`` with every non-ASCII character removed.

    Emojis are dropped because they lie outside the ASCII range, but so is any
    other non-ASCII character (accented letters, curly quotes, ...).
    """
    return ''.join(char for char in text if ord(char) < 128)

df['review'] = df['review'].apply(remove_emojis)

# Tokenization
# Token lists go into their own column; 'review' stays a plain string.
df['tokens'] = df['review'].apply(word_tokenize)

# Stemming and Lemmatization
# Both columns are derived from the same token lists so they can be compared
# side by side.
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

token_lists = df['tokens']
df['stemmed'] = token_lists.apply(lambda tokens: list(map(ps.stem, tokens)))
df['lemmatized'] = token_lists.apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))

# Display the preprocessed text
preview = df.head()
print(preview)


# Assignment 2: Text Representation

# In [None]:
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Problem 1: Word Count and Unique Words
# Flatten all reviews into one list of whitespace-separated words.
word_list = [word for review in df['review'] for word in review.split()]
word_count = len(word_list)
distinct_words = set(word_list)
unique_words = len(distinct_words)
print(f'Total Words: {word_count}, Unique Words: {unique_words}')

# Problem 2: One-Hot Encoding
# NOTE: the encoder is fitted on a single column holding whole review strings,
# so every distinct review becomes its own category (one column per review).
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(df['review'].values.reshape(-1, 1))
# fit_transform returns a sparse matrix. Calling .toarray() on all of it would
# allocate a dense (n_reviews x n_distinct_reviews) array just for printing,
# so report the shape and densify only a small top-left corner.
print(one_hot_encoded.shape)
print(one_hot_encoded[:5, :5].toarray())

# Problem 3: Bag of Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['review'])
print(vectorizer.vocabulary_)
# X_bow is sparse with one column per vocabulary term. Densifying the whole
# (n_reviews x vocabulary_size) matrix can exhaust memory, so print the shape
# and only a small dense corner as a preview.
print(X_bow.shape)
print(X_bow[:5, :10].toarray())

# Problem 4: Bag of Bi-gram and Tri-gram
# ngram_range=(n, n) restricts the vocabulary to n-grams of exactly n words.
reviews = df['review']

bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
X_bigram = bigram_vectorizer.fit_transform(reviews)
bigram_vocabulary = bigram_vectorizer.vocabulary_
print(bigram_vocabulary)

trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
X_trigram = trigram_vectorizer.fit_transform(reviews)
trigram_vocabulary = trigram_vectorizer.vocabulary_
print(trigram_vocabulary)

# Problem 5: TF-IDF
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['review'])

# Pair every vocabulary term with its inverse-document-frequency weight.
feature_names = tfidf_vectorizer.get_feature_names_out()
idf_scores = {term: idf for term, idf in zip(feature_names, tfidf_vectorizer.idf_)}
print(idf_scores)


# Assignment 3: Word2Vec Implementation

# In [None]:
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Problem 1: Data Preparation is already done in Assignment 1

# Problem 2: Training Word2Vec Model
# Both models share every hyper-parameter except `sg`
# (0 selects CBOW, 1 selects skip-gram).
sentences = df['tokens'].tolist()
w2v_settings = {'vector_size': 100, 'window': 5, 'min_count': 1}
model_cbow = Word2Vec(sentences, sg=0, **w2v_settings)
model_skipgram = Word2Vec(sentences, sg=1, **w2v_settings)

# Problem 3: Exploring Word Embeddings
# Nearest neighbours of 'good' in each model, CBOW first.
for w2v_model in (model_cbow, model_skipgram):
    print(w2v_model.wv.most_similar('good'))

# Analogy query on the CBOW model: king - man + woman.
analogy_neighbours = model_cbow.wv.most_similar(positive=['king', 'woman'], negative=['man'])
print(analogy_neighbours)

# Problem 4: Visualization of Word Embeddings
def plot_words(model):
    """Scatter-plot every word vector of ``model`` after a 2-component PCA.

    Each point is annotated with its word, and the figure is shown
    immediately via ``plt.show()``.

    Args:
        model: A trained model exposing its vectors as ``model.wv`` with an
            ``index_to_key`` vocabulary list.
    """
    vocabulary = list(model.wv.index_to_key)
    embeddings = model.wv[model.wv.index_to_key]

    # Project the embeddings onto their first two principal components.
    projected = PCA(n_components=2).fit_transform(embeddings)
    xs, ys = projected[:, 0], projected[:, 1]

    plt.scatter(xs, ys)
    for word, x, y in zip(vocabulary, xs, ys):
        plt.annotate(word, xy=(x, y))
    plt.show()

# One figure per model, CBOW first.
for trained_model in (model_cbow, model_skipgram):
    plot_words(trained_model)


# Assignment 4: Text Classification

# In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Problem 1: Data Preparation is already done in Assignment 1

# Problem 2: Feature Extraction is already done in Assignment 2

# Splitting data for training and testing
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
features, labels = X_tfidf, df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Problem 3: Model Building
# Display name -> untrained estimator with default hyper-parameters.
# Insertion order is the order in which the models are evaluated below.
estimator_classes = (
    ("Logistic Regression", LogisticRegression),
    ("Naive Bayes", MultinomialNB),
    ("SVM", SVC),
    ("Random Forest", RandomForestClassifier),
    ("Gradient Boosting", GradientBoostingClassifier),
)
models = {label: estimator_cls() for label, estimator_cls in estimator_classes}

# Problem 4: Model Evaluation
# Fit each model on the training split and report its metrics on the
# held-out test split.
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy}")
    print(f"{name} Confusion Matrix:\n {matrix}")
    print(f"{name} Classification Report:\n {report}")

# Problem 5: Model Tuning can be done using GridSearchCV or RandomizedSearchCV


# Assignment 5: POS Tagging

# In [None]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import spacy

# Problem 1: Data Preparation is already done in Assignment 1

# Problem 2: POS Tagging using NLTK
# The perceptron tagger model must be available before pos_tag is called.
nltk.download('averaged_perceptron_tagger')
df['pos_tags'] = df['tokens'].apply(pos_tag)
nltk_tag_preview = df['pos_tags'].head()
print(nltk_tag_preview)

# Problem 3: Custom POS Tagging
# Here, we create a simple rule-based POS tagger driven purely by word suffix.
# Rules are tried in order and the first matching suffix wins; words matching
# no rule are tagged 'NN' (noun).
suffix_tags = (
    ('ing', 'VBG'),  # Gerund
    ('ed', 'VBD'),   # Past tense
)
custom_tags = [
    [
        (word, next((tag for suffix, tag in suffix_tags if word.endswith(suffix)), 'NN'))
        for word in tokens
    ]
    for tokens in df['tokens']
]
df['custom_pos_tags'] = custom_tags
print(df['custom_pos_tags'].head())

# Problem 4: Advanced POS Tagging using SpaCy
nlp = spacy.load('en_core_web_sm')
# nlp.pipe streams the reviews through the pipeline in batches instead of
# invoking nlp() separately for every row. It yields one Doc per input text,
# in input order, so the list lines up with the DataFrame rows.
df['spacy_pos_tags'] = [
    [(token.text, token.pos_) for token in doc]
    for doc in nlp.pipe(df['review'])
]
print(df['spacy_pos_tags'].head())

# Problem 5: Application of POS Tagging
# Keep the words whose NLTK tag begins with 'NN' (the noun tags).
df['nouns'] = df['pos_tags'].apply(
    lambda tagged: [word for word, pos in tagged if pos[:2] == 'NN']
)
noun_preview = df['nouns'].head()
print(noun_preview)
