In [1]:
import os
import pandas as pd

# Step 1: Load Data
file_path = 'npi.txt'

# Check if the file exists
if not os.path.exists(file_path):
    print(f"Error: The file '{file_path}' does not exist.")
else:
    try:
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            data = file.readlines()
    except Exception as e:
        print(f"Error reading the file: {e}")
    else:
        # Step 2: Create DataFrame
        data = [line.strip() for line in data]
        df = pd.DataFrame(data, columns=['text'])

        # Display the first few rows of the DataFrame
        print(df.head())

          
        

        # Step 3: Data Exploration
        print(df.info())
        print(df.describe())
        print(df['text'].head(10))


                                                text
0  Who?\tको?\tCC-BY 2.0 (France) Attribution: tat...
1  Hide.\tलुकाउनुहोस्।\tCC-BY 2.0 (France) Attrib...
2  Hide.\tलुक।\tCC-BY 2.0 (France) Attribution: t...
3  Stay.\tबस्नुहोस्।\tCC-BY 2.0 (France) Attribut...
4  Hello!\tनमस्ते!\tCC-BY 2.0 (France) Attributio...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2689 entries, 0 to 2688
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2689 non-null   object
dtypes: object(1)
memory usage: 21.1+ KB
None
                                                     text
count                                                2689
unique                                               2689
top     Who?\tको?\tCC-BY 2.0 (France) Attribution: tat...
freq                                                    1
0    Who?\tको?\tCC-BY 2.0 (France) Attribution: tat...
1    Hide.\tलुकाउनुहोस्।\tCC-BY 2.0 (France) Attrib...
2    Hide.\tलुक।\tCC-B

In [2]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Clean the text data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

df['text'] = df['text'].apply(clean_text)

# Vectorization using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['text']).toarray()

# Assuming we have labels in the data (you need to adjust this according to your actual data)
# For demonstration, I'm generating random labels
import numpy as np
df['label'] = np.random.randint(0, 2, df.shape[0])  # Replace this with actual labels if available

y = df['label']

# Step 4: Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Step 5: Define Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Step 6: Evaluate Model
y_pred = model.predict(X_test)

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted")}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted")}')
print(f'F1 Score: {f1_score(y_test, y_pred, average="weighted")}')
print(f'ROC AUC Score: {roc_auc_score(y_test, y_pred)}')


Accuracy: 0.4758364312267658
Precision: 0.4762152976265822
Recall: 0.4758364312267658
F1 Score: 0.47560458059160454
ROC AUC Score: 0.476162830879812


In [4]:
import joblib

# Step 7: Save Model
joblib.dump(model, 'text_classification_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [5]:
import joblib
import numpy as np

# Load the saved model and vectorizer
model = joblib.load('text_classification_model.pkl')
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Sample new data
new_data = ["Sample text for prediction."]

# Clean the new data
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

new_data_cleaned = [clean_text(text) for text in new_data]

# Transform the new data using the loaded vectorizer
X_new = tfidf.transform(new_data_cleaned).toarray()

# Make predictions
predictions = model.predict(X_new)
print(f'Predictions: {predictions}')


Predictions: [1]


In [6]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid for Logistic Regression
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')

# Fit Grid Search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f'Best Parameters: {best_params}')
print(f'Best Model: {best_model}')

# Save the best model
joblib.dump(best_model, 'best_text_classification_model.pkl')


Best Parameters: {'C': 10, 'solver': 'liblinear'}
Best Model: LogisticRegression(C=10, solver='liblinear')


['best_text_classification_model.pkl']

In [7]:
from flask import Flask, request, jsonify
import joblib

app = Flask(__name__)

# Load model and vectorizer
try:
    model = joblib.load('text_classification_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
except Exception as e:
    print(f"Error loading model or vectorizer: {e}")
    exit(1)

@app.route('/predict', methods=['POST'])
def predict():
    try:
        data = request.json
        text = data['text']
        transformed_text = vectorizer.transform([text])
        prediction = model.predict(transformed_text)
        return jsonify({'prediction': int(prediction[0])})
    except Exception as e:
        return jsonify({'error': str(e)})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [9]:
import re

# Function to load translation data from the file
def load_translation_data(file_path):
    translation_dict = {}
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        lines = file.readlines()
    
    for line in lines:
        # Assuming the file format is: English text\tNepali text\t...
        parts = line.split('\t')
        if len(parts) >= 2:
            english_text = parts[0].strip().lower()
            nepali_text = parts[1].strip()
            translation_dict[english_text] = nepali_text
    
    return translation_dict

# Function to translate English text to Nepali
def translate_to_nepali(word, translation_dict):
    return translation_dict.get(word.lower(), "Translation not found")

def main():
    # Load translation data from the file
    translation_dict = load_translation_data('npi.txt')
    
    # Ask user for input
    user_input = input("Enter an English word or phrase to translate to Nepali: ").strip()

    # Translate the input
    nepali_translation = translate_to_nepali(user_input, translation_dict)
    
    # Display the result
    print(f"English: {user_input}")
    print(f"Nepali: {nepali_translation}")

if __name__ == "__main__":
    main()


English: Hide.
Nepali: लुक।
